From e15af206c133896130d2ca43274e90a1dc7cdd8f Mon Sep 17 00:00:00 2001
From: Gautam Botrel
Date: Mon, 7 Nov 2022 16:12:17 -0600
Subject: [PATCH 01/43] feat: ported msm-affine

---
 ecc/bls12-377/g1.go | 74 +
 ecc/bls12-377/g2.go | 74 +
 ecc/bls12-377/multiexp_affine.go | 1883 +++++++++++++++++
 ecc/bls12-377/multiexp_test.go | 84 +-
 ecc/bls12-378/g1.go | 74 +
 ecc/bls12-378/g2.go | 74 +
 ecc/bls12-378/multiexp_affine.go | 1883 +++++++++++++++++
 ecc/bls12-378/multiexp_test.go | 84 +-
 ecc/bls12-381/g1.go | 74 +
 ecc/bls12-381/g2.go | 74 +
 ecc/bls12-381/multiexp_affine.go | 1883 +++++++++++++++++
 ecc/bls12-381/multiexp_test.go | 84 +-
 ecc/bls24-315/g1.go | 74 +
 ecc/bls24-315/g2.go | 74 +
 ecc/bls24-315/multiexp_affine.go | 1883 +++++++++++++++++
 ecc/bls24-315/multiexp_test.go | 84 +-
 ecc/bls24-317/g1.go | 74 +
 ecc/bls24-317/g2.go | 74 +
 ecc/bls24-317/multiexp_affine.go | 1883 +++++++++++++++++
 ecc/bls24-317/multiexp_test.go | 84 +-
 ecc/bn254/g1.go | 74 +
 ecc/bn254/g2.go | 74 +
 ecc/bn254/multiexp_affine.go | 1883 +++++++++++++++++
 ecc/bn254/multiexp_test.go | 84 +-
 ecc/bw6-633/g1.go | 74 +
 ecc/bw6-633/g2.go | 74 +
 ecc/bw6-633/multiexp_affine.go | 857 ++++++++
 ecc/bw6-633/multiexp_test.go | 84 +-
 ecc/bw6-756/g1.go | 74 +
 ecc/bw6-756/g2.go | 74 +
 ecc/bw6-756/multiexp_affine.go | 857 ++++++++
 ecc/bw6-756/multiexp_test.go | 84 +-
 ecc/bw6-761/g1.go | 74 +
 ecc/bw6-761/g2.go | 74 +
 ecc/bw6-761/multiexp_affine.go | 857 ++++++++
 ecc/bw6-761/multiexp_test.go | 84 +-
 internal/generator/ecc/generate.go | 1 +
 .../ecc/template/multiexp_affine.go.tmpl | 474 +++++
 internal/generator/ecc/template/point.go.tmpl | 79 +
 .../ecc/template/tests/multiexp.go.tmpl | 43 +-
 40 files changed, 16535 insertions(+), 19 deletions(-)
 create mode 100644 ecc/bls12-377/multiexp_affine.go
 create mode 100644 ecc/bls12-378/multiexp_affine.go
 create mode 100644 ecc/bls12-381/multiexp_affine.go
 create mode 100644 ecc/bls24-315/multiexp_affine.go
 create mode 100644 ecc/bls24-317/multiexp_affine.go
 create mode 100644 ecc/bn254/multiexp_affine.go
 create mode 100644 ecc/bw6-633/multiexp_affine.go
 create mode 100644 ecc/bw6-756/multiexp_affine.go
 create mode 100644 ecc/bw6-761/multiexp_affine.go
 create mode 100644 internal/generator/ecc/template/multiexp_affine.go.tmpl

diff --git a/ecc/bls12-377/g1.go b/ecc/bls12-377/g1.go
index 28be402bbf..bc9027480a 100644
--- a/ecc/bls12-377/g1.go
+++ b/ecc/bls12-377/g1.go
@@ -979,3 +979,77 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin
 	toReturnAff := BatchJacobianToAffineG1(toReturn)
 	return toReturnAff
 }
+
+// batch add/dbl in affine coordinates
+// using batch inversion
+// cost add: 5*batchSize M + 1I, dbl: +1M
+func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) {
+	if batchSize == 0 {
+		return
+	}
+	var isDbl [MAX_BATCH_SIZE]bool
+	var lambda [MAX_BATCH_SIZE]fp.Element
+
+	{
+		var lambdain [MAX_BATCH_SIZE]fp.Element
+
+		for j := 0; j < batchSize; j++ {
+			// detect dbl vs add & compute denominator
+			if P[j].Equal(R[j]) {
+				isDbl[j] = true
+				lambdain[j].Double(&P[j].Y)
+			} else {
+				lambdain[j].Sub(&P[j].X, &R[j].X)
+			}
+		}
+
+		// invert denominator
+		BatchInvertG1Affine(&lambda, &lambdain, batchSize)
+
+	}
+
+	var d fp.Element
+	var rr G1Affine
+
+	for j := 0; j < batchSize; j++ {
+		// compute lambda, distinguishing dbl / add
+		if isDbl[j] {
+			d.Square(&P[j].X)
+			lambda[j].Mul(&lambda[j], &d)
+			d.Double(&lambda[j])
+			lambda[j].Add(&lambda[j], &d)
+		} else {
+			d.Sub(&P[j].Y, &R[j].Y)
+			lambda[j].Mul(&lambda[j], &d)
+		}
+
+		// compute X, Y
+		rr.X.Square(&lambda[j])
+		rr.X.Sub(&rr.X, &R[j].X)
+		rr.X.Sub(&rr.X, &P[j].X)
+		d.Sub(&R[j].X, &rr.X)
+		rr.Y.Mul(&lambda[j], &d)
+		rr.Y.Sub(&rr.Y, &R[j].Y)
+		R[j].Set(&rr)
+	}
+}
+
+// batch inversion
+// similar to BatchInvert for fp.Element; ignores edge cases
+func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) {
+
+	var accumulator fp.Element
+	accumulator.SetOne()
+
+	for i := 0; i < n; i++ {
+		res[i] = accumulator
+		accumulator.Mul(&accumulator, &a[i])
+	}
+
+	accumulator.Inverse(&accumulator)
+
+	for i := n - 1; i >= 0; i-- {
+		res[i].Mul(&res[i], &accumulator)
+		accumulator.Mul(&accumulator, &a[i])
+	}
+}
diff --git a/ecc/bls12-377/g2.go b/ecc/bls12-377/g2.go
index 8b8ecb3161..fdf535ca82 100644
--- a/ecc/bls12-377/g2.go
+++ b/ecc/bls12-377/g2.go
@@ -975,3 +975,77 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin
 	})
 	return toReturn
 }
+
+// batch add/dbl in affine coordinates
+// using batch inversion
+// cost add: 5*batchSize M + 1I, dbl: +1M
+func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) {
+	if batchSize == 0 {
+		return
+	}
+	var isDbl [MAX_BATCH_SIZE]bool
+	var lambda [MAX_BATCH_SIZE]fptower.E2
+
+	{
+		var lambdain [MAX_BATCH_SIZE]fptower.E2
+
+		for j := 0; j < batchSize; j++ {
+			// detect dbl vs add & compute denominator
+			if P[j].Equal(R[j]) {
+				isDbl[j] = true
+				lambdain[j].Double(&P[j].Y)
+			} else {
+				lambdain[j].Sub(&P[j].X, &R[j].X)
+			}
+		}
+
+		// invert denominator
+		BatchInvertG2Affine(&lambda, &lambdain, batchSize)
+
+	}
+
+	var d fptower.E2
+	var rr G2Affine
+
+	for j := 0; j < batchSize; j++ {
+		// compute lambda, distinguishing dbl / add
+		if isDbl[j] {
+			d.Square(&P[j].X)
+			lambda[j].Mul(&lambda[j], &d)
+			d.Double(&lambda[j])
+			lambda[j].Add(&lambda[j], &d)
+		} else {
+			d.Sub(&P[j].Y, &R[j].Y)
+			lambda[j].Mul(&lambda[j], &d)
+		}
+
+		// compute X, Y
+		rr.X.Square(&lambda[j])
+		rr.X.Sub(&rr.X, &R[j].X)
+		rr.X.Sub(&rr.X, &P[j].X)
+		d.Sub(&R[j].X, &rr.X)
+		rr.Y.Mul(&lambda[j], &d)
+		rr.Y.Sub(&rr.Y, &R[j].Y)
+		R[j].Set(&rr)
+	}
+}
+
+// batch inversion
+// similar to BatchInvert for fptower.E2; ignores edge cases
+func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fptower.E2, n int) {
+
+	var accumulator fptower.E2
+	accumulator.SetOne()
+
+	for i := 0; i < n; i++ {
+		res[i] = accumulator
+		accumulator.Mul(&accumulator, &a[i])
+	}
+
+	accumulator.Inverse(&accumulator)
+
+	for i := n - 1; i >= 0; i-- {
+		res[i].Mul(&res[i], &accumulator)
+		accumulator.Mul(&accumulator, &a[i])
+	}
+}
diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go
new file mode 100644
index 0000000000..876ae12f01
--- /dev/null
+++ b/ecc/bls12-377/multiexp_affine.go
@@ -0,0 +1,1883 @@
+// Copyright 2020 ConsenSys Software Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
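The BatchInvertG1Affine / BatchInvertG2Affine helpers above are the Montgomery batch-inversion trick: n field inversions for the cost of one inversion plus 3(n-1) multiplications. A minimal standalone sketch of the same forward/backward pass, assuming gnark-crypto's bls12-377 fp package; the package layout and the final check are illustrative, not part of the patch:

package main

import (
	"fmt"

	"github.com/consensys/gnark-crypto/ecc/bls12-377/fp"
)

// batchInvert replaces every a[i] with 1/a[i] using a single field inversion:
// a forward pass of prefix products, one Inverse, then a backward unwind.
func batchInvert(a []fp.Element) {
	if len(a) == 0 {
		return
	}
	prefix := make([]fp.Element, len(a))
	var acc fp.Element
	acc.SetOne()
	for i := 0; i < len(a); i++ {
		prefix[i] = acc // prefix[i] = a[0]*...*a[i-1]
		acc.Mul(&acc, &a[i])
	}
	acc.Inverse(&acc) // the only inversion: 1/(a[0]*...*a[n-1])
	for i := len(a) - 1; i >= 0; i-- {
		prefix[i].Mul(&prefix[i], &acc) // = 1/a[i]
		acc.Mul(&acc, &a[i])            // peel a[i] off the running inverse
		a[i] = prefix[i]
	}
}

func main() {
	a := make([]fp.Element, 8)
	for i := range a {
		a[i].SetRandom()
	}
	a0 := a[0]
	batchInvert(a)
	var check fp.Element
	check.Mul(&a[0], &a0)
	fmt.Println("a[0] * 1/a[0] == 1:", check.IsOne()) // expect true
}

The helpers in the patch write the inverses into a separate fixed-size array rather than in place, but the pass structure is identical.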
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package bls12377
+
+import (
+	"errors"
+	"github.com/consensys/gnark-crypto/ecc"
+	"github.com/consensys/gnark-crypto/ecc/bls12-377/fr"
+	"math"
+	"runtime"
+)
+
+const MAX_BATCH_SIZE = 600
+
+type batchOp struct {
+	bucketID, pointID uint32
+}
+
+func (o batchOp) isNeg() bool {
+	return o.pointID&1 == 1
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) {
+	var _p G1Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunk digits
+	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks,
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
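+		// worked example (illustrative numbers, not from the patch): for
+		// nbPoints = 2^20 and 256-bit scalars, c = 16 costs
+		// 256/16 * (2^20 + 2^16) ≈ 17.8M group ops, c = 12 costs
+		// 256/12 * (2^20 + 2^12) ≈ 22.5M, and c = 21 costs
+		// 256/21 * (2^20 + 2^21) ≈ 38.3M; the larger window wins only
+		// until the 2^c bucket term dominates.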
+ // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results + min := math.MaxFloat64 + for _, c := range implementedCs { + cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cost := float64(cc) / float64(c) + if cost < min { + min = cost + C = c + } + } + // empirical, needs to be tuned. + // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. + _p := make([]G1Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 6: + p.msmC6(points, scalars, splitFirstChunk) + + case 7: + p.msmC7(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 9: + p.msmC9(points, scalars, splitFirstChunk) + + case 10: + p.batchAffineMsmC10(points, scalars, splitFirstChunk) + + case 11: + p.batchAffineMsmC11(points, scalars, splitFirstChunk) + + case 12: + p.batchAffineMsmC12(points, scalars, splitFirstChunk) + + case 13: + p.batchAffineMsmC13(points, scalars, splitFirstChunk) + + case 14: + p.batchAffineMsmC14(points, scalars, splitFirstChunk) + + case 15: + p.batchAffineMsmC15(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + case 20: + p.batchAffineMsmC20(points, scalars, splitFirstChunk) + + case 21: + p.batchAffineMsmC21(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { + var _p g1JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + 
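+		// (illustration, not in the original patch: this loop is a Horner
+		// evaluation in the group; the accumulator is doubled c times, i.e.
+		// multiplied by 2^c, before the next lower chunk is added, so chunk j
+		// ends up weighted by 2^{c*j}.)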
_p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG1Affine struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G1Affine +} + +func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG1Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG1Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG1Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG1Affine) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG1Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { + B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG1AffineBatchAffine(chunk uint64, + chRes chan<- g1JacExtended, + buckets []G1Affine, + c uint64, + points []G1Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG1Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. 
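+	// (illustration, not in the original patch: a bucket may appear at most
+	// once per batch; BatchAddG1Affine reads each R[j] and writes it back, so
+	// two ops on the same bucket within one batch would compute the second
+	// addition from stale coordinates. Ops that conflict with the current
+	// batch wait in this queue instead.)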
+ nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG1Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var runningSum, total g1JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan 
g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], 
buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll 
loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if 
!splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 
0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// +// This call return an error if len(scalars) != len(points) or if provided config is invalid. +func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { + var _p G2Jac + if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { + return nil, err + } + p.FromJacobian(&_p) + return p, nil +} + +// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// +// This call return an error if len(scalars) != len(points) or if provided config is invalid. 
+func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunk digits
+	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks,
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
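+		// (illustration, not in the original patch: the trade-off mirrors the
+		// G1 case; with 256-bit scalars, c = 16 yields 16 chunks while c = 21
+		// yields ceil(256/21) = 13 chunks, so fewer, larger windows pay more
+		// in the 2^c bucket-reduction term.)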
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. + _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 6: + p.msmC6(points, scalars, splitFirstChunk) + + case 7: + p.msmC7(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 9: + p.msmC9(points, scalars, splitFirstChunk) + + case 10: + p.batchAffineMsmC10(points, scalars, splitFirstChunk) + + case 11: + p.batchAffineMsmC11(points, scalars, splitFirstChunk) + + case 12: + p.batchAffineMsmC12(points, scalars, splitFirstChunk) + + case 13: + p.batchAffineMsmC13(points, scalars, splitFirstChunk) + + case 14: + p.batchAffineMsmC14(points, scalars, splitFirstChunk) + + case 15: + p.batchAffineMsmC15(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + case 20: + p.batchAffineMsmC20(points, scalars, splitFirstChunk) + + case 21: + p.batchAffineMsmC21(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { + var _p g2JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG2Affine struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G2Affine +} + +func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { + batchSize := 
len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG2Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG2Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG2Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG2Affine) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG2Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { + B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG2AffineBatchAffine(chunk uint64, + chRes chan<- g2JacExtended, + buckets []G2Affine, + c uint64, + points []G2Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG2Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? 
should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG2Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +func (p *G2Jac) batchAffineMsmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical 
for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, 
points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, 
scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window 
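+	// (note, illustrative: c = 16 divides fr.Limbs*64 = 256 exactly, so every window
+	// sits inside a single 64-bit limb and there is no smaller trailing window; this is
+	// why this variant uses nbChunks channels below rather than the nbChunks+1 of the other sizes)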
+ // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine 
that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 4e83dd849e..9e40c04401 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -187,6 +187,39 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + var result1, result2 G1Jac + for _, c := range cRange { + scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) + msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) + msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + if !result1.Equal(&result2) { + return false + } + } + return true + }, + genScalar, + )) + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -245,12 +278,19 @@ func BenchmarkMultiExpG1(b *testing.B) { for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -478,6 +518,39 @@ func TestMultiExpG2(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+					FromMont()
+			}
+
+			var result1, result2 G2Jac
+			for _, c := range cRange {
+				scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU())
+				msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false)
+				msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false)
+				if !result1.Equal(&result2) {
+					return false
+				}
+			}
+			return true
+		},
+		genScalar,
+	))
+
 	// note : this test is here as we expect to have a different multiExp than the above bucket method
 	// for small number of points
 	properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll(
@@ -536,12 +609,19 @@ func BenchmarkMultiExpG2(b *testing.B) {
 	for i := 5; i <= pow; i++ {
 		using := 1 << i
 
-		b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) {
+		b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) {
 			b.ResetTimer()
 			for j := 0; j < b.N; j++ {
 				testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{})
 			}
 		})
+
+		b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) {
+			b.ResetTimer()
+			for j := 0; j < b.N; j++ {
+				testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{})
+			}
+		})
 	}
 }
diff --git a/ecc/bls12-378/g1.go b/ecc/bls12-378/g1.go
index 17d313fedf..fd8fbe7ee0 100644
--- a/ecc/bls12-378/g1.go
+++ b/ecc/bls12-378/g1.go
@@ -979,3 +979,77 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin
 	toReturnAff := BatchJacobianToAffineG1(toReturn)
 	return toReturnAff
 }
+
+// batch add/dbl in affine coordinates
+// using batch inversion
+// cost add: 5*batchSize M + 1I, dbl: +1M
+func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) {
+	if batchSize == 0 {
+		return
+	}
+	var isDbl [MAX_BATCH_SIZE]bool
+	var lambda [MAX_BATCH_SIZE]fp.Element
+
+	{
+		var lambdain [MAX_BATCH_SIZE]fp.Element
+
+		for j := 0; j < batchSize; j++ {
+			// detect dbl vs add & compute denominator
+			if P[j].Equal(R[j]) {
+				isDbl[j] = true
+				lambdain[j].Double(&P[j].Y)
+			} else {
+				lambdain[j].Sub(&P[j].X, &R[j].X)
+			}
+		}
+
+		// invert denominator
+		BatchInvertG1Affine(&lambda, &lambdain, batchSize)
+
+	}
+
+	var d fp.Element
+	var rr G1Affine
+
+	for j := 0; j < batchSize; j++ {
+		// compute lambda, distinguishing dbl / add
+		if isDbl[j] {
+			d.Square(&P[j].X)
+			lambda[j].Mul(&lambda[j], &d)
+			d.Double(&lambda[j])
+			lambda[j].Add(&lambda[j], &d)
+		} else {
+			d.Sub(&P[j].Y, &R[j].Y)
+			lambda[j].Mul(&lambda[j], &d)
+		}
+
+		// compute X, Y
+		rr.X.Square(&lambda[j])
+		rr.X.Sub(&rr.X, &R[j].X)
+		rr.X.Sub(&rr.X, &P[j].X)
+		d.Sub(&R[j].X, &rr.X)
+		rr.Y.Mul(&lambda[j], &d)
+		rr.Y.Sub(&rr.Y, &R[j].Y)
+		R[j].Set(&rr)
+	}
+}
+
+// batch inversion
+// similar to BatchInvert (fp.Element version); ignores edge cases (zero inputs)
+func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) {
+
+	var accumulator fp.Element
+	accumulator.SetOne()
+
+	for i := 0; i < n; i++ {
+		res[i] = accumulator
+		accumulator.Mul(&accumulator, &a[i])
+	}
+
+	accumulator.Inverse(&accumulator)
+
+	for i := n - 1; i >= 0; i-- {
+		res[i].Mul(&res[i], &accumulator)
+		accumulator.Mul(&accumulator, &a[i])
+	}
+}
diff --git a/ecc/bls12-378/g2.go b/ecc/bls12-378/g2.go
index ed5da711ae..479cda7053 100644
--- a/ecc/bls12-378/g2.go
+++ b/ecc/bls12-378/g2.go
@@ -975,3 +975,77 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin
 	})
 	return toReturn
 }
+
+// batch add/dbl in affine coordinates
+// using batch inversion
+// cost add: 5*batchSize M + 1I, dbl: +1M
+func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) {
+	if batchSize == 0 {
+		return
+	}
+	var isDbl [MAX_BATCH_SIZE]bool
+	var lambda [MAX_BATCH_SIZE]fptower.E2
+
+	{
+		var lambdain [MAX_BATCH_SIZE]fptower.E2
+
+		for j := 0; j < batchSize; j++ {
+			// detect dbl vs add & compute denominator
+			if P[j].Equal(R[j]) {
+				isDbl[j] = true
+				lambdain[j].Double(&P[j].Y)
+			} else {
+				lambdain[j].Sub(&P[j].X, &R[j].X)
+			}
+		}
+
+		// invert denominator
+		BatchInvertG2Affine(&lambda, &lambdain, batchSize)
+
+	}
+
+	var d fptower.E2
+	var rr G2Affine
+
+	for j := 0; j < batchSize; j++ {
+		// compute lambda, distinguishing dbl / add
+		if isDbl[j] {
+			d.Square(&P[j].X)
+			lambda[j].Mul(&lambda[j], &d)
+			d.Double(&lambda[j])
+			lambda[j].Add(&lambda[j], &d)
+		} else {
+			d.Sub(&P[j].Y, &R[j].Y)
+			lambda[j].Mul(&lambda[j], &d)
+		}
+
+		// compute X, Y
+		rr.X.Square(&lambda[j])
+		rr.X.Sub(&rr.X, &R[j].X)
+		rr.X.Sub(&rr.X, &P[j].X)
+		d.Sub(&R[j].X, &rr.X)
+		rr.Y.Mul(&lambda[j], &d)
+		rr.Y.Sub(&rr.Y, &R[j].Y)
+		R[j].Set(&rr)
+	}
+}
+
+// batch inversion
+// similar to BatchInvert (fptower.E2 version); ignores edge cases (zero inputs)
+func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fptower.E2, n int) {
+
+	var accumulator fptower.E2
+	accumulator.SetOne()
+
+	for i := 0; i < n; i++ {
+		res[i] = accumulator
+		accumulator.Mul(&accumulator, &a[i])
+	}
+
+	accumulator.Inverse(&accumulator)
+
+	for i := n - 1; i >= 0; i-- {
+		res[i].Mul(&res[i], &accumulator)
+		accumulator.Mul(&accumulator, &a[i])
+	}
+}
diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go
new file mode 100644
index 0000000000..b92b826e91
--- /dev/null
+++ b/ecc/bls12-378/multiexp_affine.go
@@ -0,0 +1,1883 @@
+// Copyright 2020 ConsenSys Software Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package bls12378
+
+import (
+	"errors"
+	"github.com/consensys/gnark-crypto/ecc"
+	"github.com/consensys/gnark-crypto/ecc/bls12-378/fr"
+	"math"
+	"runtime"
+)
+
+const MAX_BATCH_SIZE = 600
+
+type batchOp struct {
+	bucketID, pointID uint32
+}
+
+func (o batchOp) isNeg() bool {
+	return o.pointID&1 == 1
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) {
+	var _p G1Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
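+//
+// A minimal usage sketch (assuming caller-provided points and scalars slices of equal
+// length; an empty ecc.MultiExpConfig is valid and defaults NbTasks to runtime.NumCPU()):
+//
+//	var res G1Jac
+//	if _, err := res.MultiExpBatchAffine(points, scalars, ecc.MultiExpConfig{}); err != nil {
+//		// handle the error (length mismatch or invalid config)
+//	}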
+func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods lets us declare the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar, over c-bit wide windows, nbChunk digits
+	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
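+		// worked example (illustrative, not a measurement): with fr.Limbs*64 = 256 and
+		// nbPoints = 2^20, c = 16 costs 256/16 * (2^20 + 2^16) ≈ 1.8e7 group ops while
+		// c = 12 costs 256/12 * (2^20 + 2^12) ≈ 2.2e7, so here the formula picks c = 16.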
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. + _p := make([]G1Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 6: + p.msmC6(points, scalars, splitFirstChunk) + + case 7: + p.msmC7(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 9: + p.msmC9(points, scalars, splitFirstChunk) + + case 10: + p.batchAffineMsmC10(points, scalars, splitFirstChunk) + + case 11: + p.batchAffineMsmC11(points, scalars, splitFirstChunk) + + case 12: + p.batchAffineMsmC12(points, scalars, splitFirstChunk) + + case 13: + p.batchAffineMsmC13(points, scalars, splitFirstChunk) + + case 14: + p.batchAffineMsmC14(points, scalars, splitFirstChunk) + + case 15: + p.batchAffineMsmC15(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + case 20: + p.batchAffineMsmC20(points, scalars, splitFirstChunk) + + case 21: + p.batchAffineMsmC21(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { + var _p g1JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG1Affine struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G1Affine +} + +func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { + batchSize := 
len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG1Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG1Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG1Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG1Affine) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG1Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { + B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG1AffineBatchAffine(chunk uint64, + chRes chan<- g1JacExtended, + buckets []G1Affine, + c uint64, + points []G1Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG1Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? 
should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG1Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var runningSum, total g1JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical 
for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, 
points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, 
scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window 
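+	// (note, illustrative: c = 16 divides fr.Limbs*64 = 256 exactly, so every window
+	// sits inside a single 64-bit limb and there is no smaller trailing window; this is
+	// why this variant uses nbChunks channels below rather than the nbChunks+1 of the other sizes)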
+ // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine 
that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks + 1]chan g1JacExtended
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan g1JacExtended, 1)
+	}
+
+	// c doesn't divide 256; the last window is smaller, so we can allocate fewer buckets
+	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
+	// TODO @gbotrel replace this in code generator
+	if lastC >= 10 {
+		go func(j uint64, points []G1Affine, scalars []fr.Element) {
+			var buckets [1 << (lastC - 1)]G1Affine
+			msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars)
+		}(uint64(nbChunks), points, scalars)
+	} else {
+		go func(j uint64, points []G1Affine, scalars []fr.Element) {
+			var buckets [1 << (lastC - 1)]g1JacExtended
+			msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars)
+		}(uint64(nbChunks), points, scalars)
+	}
+
+	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
+		var buckets [1 << (c - 1)]G1Affine
+		msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan g1JacExtended, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.add(&s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:])
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods lets us declare the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
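+	//     (concretely, assuming 4x64-bit limb base-field elements as on BN254: an affine
+	//     G1 point is two fp.Element values ≈ 64 bytes, one cache line, while an affine
+	//     G2 point is two fptower.E2 values ≈ 128 bytes, spanning two cache lines)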
+
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar, over c-bit wide windows, nbChunk digits
+	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
+		// if C > 16 && nbPoints < 1 << 23 {
+		// 	C = 16
+		// }
+		return C
+	}
+
+	var C uint64
+	nbSplits := 1
+	nbChunks := 0
+	for nbChunks < config.NbTasks {
+		C = bestC(nbPoints)
+		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+		if (fr.Limbs*64)%C != 0 {
+			nbChunks++
+		}
+		nbChunks *= nbSplits
+		if nbChunks < config.NbTasks {
+			nbSplits <<= 1
+			nbPoints >>= 1
+		}
+	}
+
+	// partition the scalars
+	// note: we do that before the actual chunk processing, as for each c-bit window (starting from the LSW)
+	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
+	var smallValues int
+	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+
+	// if we have more than 10% of small values, we split the processing of the first chunk in 2
+	// we may want to do that in msmInnerG2JacBatchAffine, but that would incur the cost of looping through all scalars one more time
+	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
+
+	// we have nbSplits intermediate results that we must sum together.
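+	// (illustrative walk-through: with 256-bit scalars and config.NbTasks = 32, bestC may
+	// return c = 16, giving nbChunks = 16 < 32 on the first pass; the loop then doubles
+	// nbSplits to 2 and halves nbPoints, so two half-size MSMs of 16 chunks each run in
+	// parallel and their partial G2Jac results are accumulated below)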
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 6: + p.msmC6(points, scalars, splitFirstChunk) + + case 7: + p.msmC7(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 9: + p.msmC9(points, scalars, splitFirstChunk) + + case 10: + p.batchAffineMsmC10(points, scalars, splitFirstChunk) + + case 11: + p.batchAffineMsmC11(points, scalars, splitFirstChunk) + + case 12: + p.batchAffineMsmC12(points, scalars, splitFirstChunk) + + case 13: + p.batchAffineMsmC13(points, scalars, splitFirstChunk) + + case 14: + p.batchAffineMsmC14(points, scalars, splitFirstChunk) + + case 15: + p.batchAffineMsmC15(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + case 20: + p.batchAffineMsmC20(points, scalars, splitFirstChunk) + + case 21: + p.batchAffineMsmC21(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { + var _p g2JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG2Affine struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G2Affine +} + +func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG2Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG2Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG2Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG2Affine) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG2Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { 
+ B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG2AffineBatchAffine(chunk uint64, + chRes chan<- g2JacExtended, + buckets []G2Affine, + c uint64, + points []G2Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG2Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG2Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +func (p *G2Jac) batchAffineMsmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else 
{ + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that 
buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + 
chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if 
!splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars 
[]fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index 3e2c1ae6cf..466e6499a1 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -187,6 +187,39 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + var result1, result2 G1Jac + for _, c := range cRange { + scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) + msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) + msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + if !result1.Equal(&result2) { + return false + } + } + return true + }, + genScalar, + )) + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -245,12 +278,19 @@ func BenchmarkMultiExpG1(b *testing.B) { for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -478,6 +518,39 @@ func TestMultiExpG2(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + var result1, result2 G2Jac + for _, c := range cRange { + scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) + msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false) + msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + if !result1.Equal(&result2) { + return false + } + } + return true + }, + genScalar, + )) + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -536,12 +609,19 @@ func BenchmarkMultiExpG2(b *testing.B) { for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bls12-381/g1.go b/ecc/bls12-381/g1.go index c7291c7130..189c5ac202 100644 --- a/ecc/bls12-381/g1.go +++ b/ecc/bls12-381/g1.go @@ -979,3 +979,77 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin toReturnAff := BatchJacobianToAffineG1(toReturn) return toReturnAff } + +// batch add/dbl in affine coordinates +// using batch inversion +// cost add: 5*batchSize M + 1I, dbl: +1M +func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { + if batchSize == 0 { + return + } + var isDbl [MAX_BATCH_SIZE]bool + var lambda [MAX_BATCH_SIZE]fp.Element + + { + var lambdain [MAX_BATCH_SIZE]fp.Element + + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + } + + // invert denominator + BatchInvertG1Affine(&lambda, &lambdain, batchSize) + + } + + var d fp.Element + var rr G1Affine + + for j := 0; j < batchSize; j++ { + // computa lambda, distinguishing dbl / add + if isDbl[j] { + d.Square(&P[j].X) + lambda[j].Mul(&lambda[j], &d) + d.Double(&lambda[j]) + lambda[j].Add(&lambda[j], &d) + } else { + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) + } + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[j].X) + rr.X.Sub(&rr.X, &P[j].X) + d.Sub(&R[j].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[j].Y) + R[j].Set(&rr) + } +} + +// batch inversion +// similar to BatchInvertfp.Element, ignores edge cases +func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { + + var accumulator fp.Element + accumulator.SetOne() + + for i := 0; i < n; i++ { + res[i] = accumulator + accumulator.Mul(&accumulator, &a[i]) + } + + accumulator.Inverse(&accumulator) + + for i := n - 1; i >= 0; i-- { + res[i].Mul(&res[i], &accumulator) + accumulator.Mul(&accumulator, &a[i]) + } +} diff --git a/ecc/bls12-381/g2.go b/ecc/bls12-381/g2.go index 168943cd91..3473f3d002 100644 --- a/ecc/bls12-381/g2.go +++ b/ecc/bls12-381/g2.go @@ -976,3 +976,77 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin }) return toReturn } + +// batch add/dbl in affine coordinates +// using batch inversion +// cost add: 5*batchSize M + 1I, dbl: +1M +func 
BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { + if batchSize == 0 { + return + } + var isDbl [MAX_BATCH_SIZE]bool + var lambda [MAX_BATCH_SIZE]fptower.E2 + + { + var lambdain [MAX_BATCH_SIZE]fptower.E2 + + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + } + + // invert denominator + BatchInvertG2Affine(&lambda, &lambdain, batchSize) + + } + + var d fptower.E2 + var rr G2Affine + + for j := 0; j < batchSize; j++ { + // computa lambda, distinguishing dbl / add + if isDbl[j] { + d.Square(&P[j].X) + lambda[j].Mul(&lambda[j], &d) + d.Double(&lambda[j]) + lambda[j].Add(&lambda[j], &d) + } else { + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) + } + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[j].X) + rr.X.Sub(&rr.X, &P[j].X) + d.Sub(&R[j].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[j].Y) + R[j].Set(&rr) + } +} + +// batch inversion +// similar to BatchInvertfptower.E2, ignores edge cases +func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fptower.E2, n int) { + + var accumulator fptower.E2 + accumulator.SetOne() + + for i := 0; i < n; i++ { + res[i] = accumulator + accumulator.Mul(&accumulator, &a[i]) + } + + accumulator.Inverse(&accumulator) + + for i := n - 1; i >= 0; i-- { + res[i].Mul(&res[i], &accumulator) + accumulator.Mul(&accumulator, &a[i]) + } +} diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go new file mode 100644 index 0000000000..7970a61d7e --- /dev/null +++ b/ecc/bls12-381/multiexp_affine.go @@ -0,0 +1,1883 @@ +// Copyright 2020 ConsenSys Software Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package bls12381 + +import ( + "errors" + "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bls12-381/fr" + "math" + "runtime" +) + +const MAX_BATCH_SIZE = 600 + +type batchOp struct { + bucketID, pointID uint32 +} + +func (o batchOp) isNeg() bool { + return o.pointID&1 == 1 +} + +// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// +// This call return an error if len(scalars) != len(points) or if provided config is invalid. +func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) { + var _p G1Jac + if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { + return nil, err + } + p.FromJacobian(&_p) + return p, nil +} + +// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// +// This call return an error if len(scalars) != len(points) or if provided config is invalid. 
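+//
+// A minimal usage sketch (illustrative only, not part of the generated API surface;
+// it assumes the caller has already built points and scalars slices of equal length):
+//
+//	var r G1Jac
+//	if _, err := r.MultiExpBatchAffine(points, scalars, ecc.MultiExpConfig{}); err != nil {
+//		// len(points) != len(scalars), or config.NbTasks > 1024
+//	}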
+func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows us to declare the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	//	--> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar, over c-bit wide windows, nbChunk digits
+	// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks,
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
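+		// worked example (illustrative): with fr.Limbs = 4 the scalar has 256 bits; for
+		// nbPoints = 1<<20, cost(c=16) = 256/16 * (1<<20 + 1<<16) ≈ 17.8M group ops, vs
+		// cost(c=12) ≈ 22.5M and cost(c=20) ≈ 26.8M, so the loop above settles on c = 16.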
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. + _p := make([]G1Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 6: + p.msmC6(points, scalars, splitFirstChunk) + + case 7: + p.msmC7(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 9: + p.msmC9(points, scalars, splitFirstChunk) + + case 10: + p.batchAffineMsmC10(points, scalars, splitFirstChunk) + + case 11: + p.batchAffineMsmC11(points, scalars, splitFirstChunk) + + case 12: + p.batchAffineMsmC12(points, scalars, splitFirstChunk) + + case 13: + p.batchAffineMsmC13(points, scalars, splitFirstChunk) + + case 14: + p.batchAffineMsmC14(points, scalars, splitFirstChunk) + + case 15: + p.batchAffineMsmC15(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + case 20: + p.batchAffineMsmC20(points, scalars, splitFirstChunk) + + case 21: + p.batchAffineMsmC21(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { + var _p g1JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG1Affine struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G1Affine +} + +func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { + batchSize := 
len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG1Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG1Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG1Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG1Affine) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG1Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { + B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG1AffineBatchAffine(chunk uint64, + chRes chan<- g1JacExtended, + buckets []G1Affine, + c uint64, + points []G1Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG1Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? 
should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG1Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var runningSum, total g1JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical 
for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, 
points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, 
scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window 
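+	// (c = 16 divides fr.Limbs*64 = 256 exactly, so there is no smaller trailing
+	// window here and no lastC special case, unlike the other batchAffineMsmCX sizes)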
+ // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine 
that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks + 1]chan g1JacExtended
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan g1JacExtended, 1)
+	}
+
+	// c doesn't divide 256; the last window is smaller, so we can allocate fewer buckets
+	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
+	// TODO @gbotrel replace this in code generator
+	if lastC >= 10 {
+		go func(j uint64, points []G1Affine, scalars []fr.Element) {
+			var buckets [1 << (lastC - 1)]G1Affine
+			msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars)
+		}(uint64(nbChunks), points, scalars)
+	} else {
+		go func(j uint64, points []G1Affine, scalars []fr.Element) {
+			var buckets [1 << (lastC - 1)]g1JacExtended
+			msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars)
+		}(uint64(nbChunks), points, scalars)
+	}
+
+	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
+		var buckets [1 << (c - 1)]G1Affine
+		msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan g1JacExtended, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.add(&s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:])
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows us to declare the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	//	--> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
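+	//	    (for scale: on this curve a G1 affine point is 2*48 = 96 bytes and a G2
+	//	    affine point 2*96 = 192 bytes, i.e. two and three cache lines respectively)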
+
+ // for each batchAffineMsmCX
+ // step 1
+ // we compute, for each scalar, its digits over c-bit wide windows (nbChunks digits per scalar)
+ // if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+ // 2^{c} from the current digit, making it negative.
+ // negative digits will be processed in the next step as adding -G into the bucket instead of G
+ // (computing -G is cheap, and this saves us half of the buckets)
+ // step 2
+ // buckets are declared on the stack
+ // notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+ // we use jacobian extended formulas here as they are faster than mixed addition
+ // msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum on the given channel
+ // step 3
+ // reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+ // ensure len(points) == len(scalars)
+ nbPoints := len(points)
+ if nbPoints != len(scalars) {
+ return nil, errors.New("len(points) != len(scalars)")
+ }
+
+ // if nbTasks is not set, use all available CPUs
+ if config.NbTasks <= 0 {
+ config.NbTasks = runtime.NumCPU()
+ } else if config.NbTasks > 1024 {
+ return nil, errors.New("invalid config: config.NbTasks > 1024")
+ }
+
+ // here, we compute the best C for nbPoints
+ // we split recursively until nbChunks(c) >= nbTasks
+ bestC := func(nbPoints int) uint64 {
+ // implemented batchAffineMsmC methods (the c we use must be in this slice)
+ implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
+ var C uint64
+ // approximate cost (in group operations)
+ // cost = bits/c * (nbPoints + 2^{c})
+ // this needs to be verified empirically.
+ // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+ min := math.MaxFloat64
+ for _, c := range implementedCs {
+ cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+ cost := float64(cc) / float64(c)
+ if cost < min {
+ min = cost
+ C = c
+ }
+ }
+ // empirical, needs to be tuned.
+ // if C > 16 && nbPoints < 1 << 23 {
+ // C = 16
+ // }
+ return C
+ }
+
+ var C uint64
+ nbSplits := 1
+ nbChunks := 0
+ for nbChunks < config.NbTasks {
+ C = bestC(nbPoints)
+ nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+ if (fr.Limbs*64)%C != 0 {
+ nbChunks++
+ }
+ nbChunks *= nbSplits
+ if nbChunks < config.NbTasks {
+ nbSplits <<= 1
+ nbPoints >>= 1
+ }
+ }
+
+ // partition the scalars
+ // note: we do that before the actual chunk processing, as for each c-bit window (starting from the LSW)
+ // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
+ var smallValues int
+ scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+
+ // if we have more than 10% of small values, we split the processing of the first chunk in two
+ // we may want to do that in msmInnerG2JacBatchAffine, but that would incur the cost of looping through all scalars one more time
+ splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
+
+ // we have nbSplits intermediate results that we must sum together.
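[Editor's note] An aside on step 1 above, before the scatter/gather code that follows: the signed-digit decomposition can be sketched in isolation. This hypothetical helper is a simplified version of what partitionScalars does (a single 64-bit word and one recoding convention; the real code also handles multi-word scalars, Montgomery form, and counting small values):

// recodeWindows splits a 64-bit scalar into c-bit digits; whenever a digit is
// larger than 2^{c-1} it borrows 2^c from the next window, so the digit turns
// negative and is later handled by adding -G instead of G (halving the buckets).
func recodeWindows(s uint64, c uint) []int64 {
	var digits []int64
	carry := int64(0)
	for shift := uint(0); shift < 64; shift += c {
		d := carry + int64((s>>shift)&((1<<c)-1))
		carry = 0
		if d > 1<<(c-1) {
			d -= 1 << c // borrow 2^c from the next window...
			carry = 1   // ...and propagate the carry upward
		}
		digits = append(digits, d)
	}
	if carry != 0 {
		digits = append(digits, carry)
	}
	return digits
}

For example, with c = 4 the window value 13 becomes the digit -3 with a carry of 1 into the next window, since 13 = 16 - 3; this is why only 2^{c-1} buckets are needed, at the price of cheap point negations.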
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 6: + p.msmC6(points, scalars, splitFirstChunk) + + case 7: + p.msmC7(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 9: + p.msmC9(points, scalars, splitFirstChunk) + + case 10: + p.batchAffineMsmC10(points, scalars, splitFirstChunk) + + case 11: + p.batchAffineMsmC11(points, scalars, splitFirstChunk) + + case 12: + p.batchAffineMsmC12(points, scalars, splitFirstChunk) + + case 13: + p.batchAffineMsmC13(points, scalars, splitFirstChunk) + + case 14: + p.batchAffineMsmC14(points, scalars, splitFirstChunk) + + case 15: + p.batchAffineMsmC15(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + case 20: + p.batchAffineMsmC20(points, scalars, splitFirstChunk) + + case 21: + p.batchAffineMsmC21(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { + var _p g2JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG2Affine struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G2Affine +} + +func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG2Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG2Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG2Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG2Affine) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG2Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { 
+ B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG2AffineBatchAffine(chunk uint64, + chRes chan<- g2JacExtended, + buckets []G2Affine, + c uint64, + points []G2Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG2Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG2Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +func (p *G2Jac) batchAffineMsmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else 
{ + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that 
buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + 
chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if 
!splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars 
[]fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index 0f9d048801..05f44f6112 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -187,6 +187,39 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + var result1, result2 G1Jac + for _, c := range cRange { + scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) + msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) + msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + if !result1.Equal(&result2) { + return false + } + } + return true + }, + genScalar, + )) + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -245,12 +278,19 @@ func BenchmarkMultiExpG1(b *testing.B) { for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -478,6 +518,39 @@ func TestMultiExpG2(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + var result1, result2 G2Jac + for _, c := range cRange { + scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) + msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false) + msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + if !result1.Equal(&result2) { + return false + } + } + return true + }, + genScalar, + )) + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -536,12 +609,19 @@ func BenchmarkMultiExpG2(b *testing.B) { for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bls24-315/g1.go b/ecc/bls24-315/g1.go index bcf23fcd01..3209c210ad 100644 --- a/ecc/bls24-315/g1.go +++ b/ecc/bls24-315/g1.go @@ -981,3 +981,77 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin toReturnAff := BatchJacobianToAffineG1(toReturn) return toReturnAff } + +// batch add/dbl in affine coordinates +// using batch inversion +// cost add: 5*batchSize M + 1I, dbl: +1M +func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { + if batchSize == 0 { + return + } + var isDbl [MAX_BATCH_SIZE]bool + var lambda [MAX_BATCH_SIZE]fp.Element + + { + var lambdain [MAX_BATCH_SIZE]fp.Element + + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + } + + // invert denominator + BatchInvertG1Affine(&lambda, &lambdain, batchSize) + + } + + var d fp.Element + var rr G1Affine + + for j := 0; j < batchSize; j++ { + // computa lambda, distinguishing dbl / add + if isDbl[j] { + d.Square(&P[j].X) + lambda[j].Mul(&lambda[j], &d) + d.Double(&lambda[j]) + lambda[j].Add(&lambda[j], &d) + } else { + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) + } + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[j].X) + rr.X.Sub(&rr.X, &P[j].X) + d.Sub(&R[j].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[j].Y) + R[j].Set(&rr) + } +} + +// batch inversion +// similar to BatchInvertfp.Element, ignores edge cases +func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { + + var accumulator fp.Element + accumulator.SetOne() + + for i := 0; i < n; i++ { + res[i] = accumulator + accumulator.Mul(&accumulator, &a[i]) + } + + accumulator.Inverse(&accumulator) + + for i := n - 1; i >= 0; i-- { + res[i].Mul(&res[i], &accumulator) + accumulator.Mul(&accumulator, &a[i]) + } +} diff --git a/ecc/bls24-315/g2.go b/ecc/bls24-315/g2.go index d7a009999d..7f377b8147 100644 --- a/ecc/bls24-315/g2.go +++ b/ecc/bls24-315/g2.go @@ -991,3 +991,77 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin }) return toReturn } + +// batch add/dbl in affine coordinates +// using batch inversion +// cost add: 5*batchSize M + 1I, dbl: +1M +func 
BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { + if batchSize == 0 { + return + } + var isDbl [MAX_BATCH_SIZE]bool + var lambda [MAX_BATCH_SIZE]fptower.E4 + + { + var lambdain [MAX_BATCH_SIZE]fptower.E4 + + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + } + + // invert denominator + BatchInvertG2Affine(&lambda, &lambdain, batchSize) + + } + + var d fptower.E4 + var rr G2Affine + + for j := 0; j < batchSize; j++ { + // computa lambda, distinguishing dbl / add + if isDbl[j] { + d.Square(&P[j].X) + lambda[j].Mul(&lambda[j], &d) + d.Double(&lambda[j]) + lambda[j].Add(&lambda[j], &d) + } else { + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) + } + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[j].X) + rr.X.Sub(&rr.X, &P[j].X) + d.Sub(&R[j].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[j].Y) + R[j].Set(&rr) + } +} + +// batch inversion +// similar to BatchInvertfptower.E4, ignores edge cases +func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fptower.E4, n int) { + + var accumulator fptower.E4 + accumulator.SetOne() + + for i := 0; i < n; i++ { + res[i] = accumulator + accumulator.Mul(&accumulator, &a[i]) + } + + accumulator.Inverse(&accumulator) + + for i := n - 1; i >= 0; i-- { + res[i].Mul(&res[i], &accumulator) + accumulator.Mul(&accumulator, &a[i]) + } +} diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go new file mode 100644 index 0000000000..e08952a333 --- /dev/null +++ b/ecc/bls24-315/multiexp_affine.go @@ -0,0 +1,1883 @@ +// Copyright 2020 ConsenSys Software Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package bls24315 + +import ( + "errors" + "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bls24-315/fr" + "math" + "runtime" +) + +const MAX_BATCH_SIZE = 600 + +type batchOp struct { + bucketID, pointID uint32 +} + +func (o batchOp) isNeg() bool { + return o.pointID&1 == 1 +} + +// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// +// This call return an error if len(scalars) != len(points) or if provided config is invalid. +func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) { + var _p G1Jac + if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { + return nil, err + } + p.FromJacobian(&_p) + return p, nil +} + +// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// +// This call return an error if len(scalars) != len(points) or if provided config is invalid. 
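[Editor's note] Before the G1Jac implementation below: the BatchInvertG1Affine/BatchInvertG2Affine helpers added in g1.go and g2.go are Montgomery's batch-inversion trick (accumulate prefix products, invert once, unwind backward). A self-contained sketch over math/big (a hypothetical stand-alone program, not the patch's field types) shows that n inverses cost one inversion plus about 3n multiplications:

package main

import (
	"fmt"
	"math/big"
)

// batchInvert returns the modular inverses of all a[i] modulo p using a single
// ModInverse call: a forward pass stores prefix products, then the inverted
// total product is unwound backward. Like the specialized helpers, it ignores
// the zero-input edge case (the MSM code screens degenerate additions, such as
// infinity and P = -Q, before queueing them).
func batchInvert(a []*big.Int, p *big.Int) []*big.Int {
	res := make([]*big.Int, len(a))
	acc := big.NewInt(1)
	for i := 0; i < len(a); i++ {
		res[i] = new(big.Int).Set(acc) // res[i] = a[0]*...*a[i-1] mod p
		acc.Mul(acc, a[i]).Mod(acc, p)
	}
	acc.ModInverse(acc, p) // acc = (a[0]*...*a[n-1])^-1 mod p
	for i := len(a) - 1; i >= 0; i-- {
		res[i].Mul(res[i], acc).Mod(res[i], p) // res[i] = a[i]^-1 mod p
		acc.Mul(acc, a[i]).Mod(acc, p)         // drop a[i] from the running inverse
	}
	return res
}

func main() {
	p := big.NewInt(97)
	inv := batchInvert([]*big.Int{big.NewInt(3), big.NewInt(10), big.NewInt(42)}, p)
	fmt.Println(inv) // each inv[i]*a[i] == 1 mod 97
}

Sharing one inversion across a whole batch is what makes the advertised cost of BatchAddG1Affine/BatchAddG2Affine (roughly 5 field multiplications per addition plus a single inversion per batch) possible, versus one full inversion per affine addition.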
+func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { + // note: + // each of the batchAffineMsmCX method is the same, except for the c constant it declares + // duplicating (through template generation) these methods allows to declare the buckets on the stack + // the choice of c needs to be improved: + // there is a theoritical value that gives optimal asymptotics + // but in practice, other factors come into play, including: + // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 + // * number of CPUs + // * cache friendliness (which depends on the host, G1 or G2... ) + // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. + + // for each batchAffineMsmCX + // step 1 + // we compute, for each scalars over c-bit wide windows, nbChunk digits + // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract + // 2^{c} to the current digit, making it negative. + // negative digits will be processed in the next step as adding -G into the bucket instead of G + // (computing -G is cheap, and this saves us half of the buckets) + // step 2 + // buckets are declared on the stack + // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) + // we use jacobian extended formulas here as they are faster than mixed addition + // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel + // step 3 + // reduce the buckets weigthed sums into our result (msmReduceChunk) + + // ensure len(points) == len(scalars) + nbPoints := len(points) + if nbPoints != len(scalars) { + return nil, errors.New("len(points) != len(scalars)") + } + + // if nbTasks is not set, use all available CPUs + if config.NbTasks <= 0 { + config.NbTasks = runtime.NumCPU() + } else if config.NbTasks > 1024 { + return nil, errors.New("invalid config: config.NbTasks > 1024") + } + + // here, we compute the best C for nbPoints + // we split recursively until nbChunks(c) >= nbTasks, + bestC := func(nbPoints int) uint64 { + // implemented batchAffineMsmC methods (the c we use must be in this slice) + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + var C uint64 + // approximate cost (in group operations) + // cost = bits/c * (nbPoints + 2^{c}) + // this needs to be verified empirically. + // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results + min := math.MaxFloat64 + for _, c := range implementedCs { + cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cost := float64(cc) / float64(c) + if cost < min { + min = cost + C = c + } + } + // empirical, needs to be tuned. 
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. + _p := make([]G1Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 6: + p.msmC6(points, scalars, splitFirstChunk) + + case 7: + p.msmC7(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 9: + p.msmC9(points, scalars, splitFirstChunk) + + case 10: + p.batchAffineMsmC10(points, scalars, splitFirstChunk) + + case 11: + p.batchAffineMsmC11(points, scalars, splitFirstChunk) + + case 12: + p.batchAffineMsmC12(points, scalars, splitFirstChunk) + + case 13: + p.batchAffineMsmC13(points, scalars, splitFirstChunk) + + case 14: + p.batchAffineMsmC14(points, scalars, splitFirstChunk) + + case 15: + p.batchAffineMsmC15(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + case 20: + p.batchAffineMsmC20(points, scalars, splitFirstChunk) + + case 21: + p.batchAffineMsmC21(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { + var _p g1JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG1Affine struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G1Affine +} + +func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { + batchSize := 
len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG1Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG1Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG1Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG1Affine) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG1Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { + B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG1AffineBatchAffine(chunk uint64, + chRes chan<- g1JacExtended, + buckets []G1Affine, + c uint64, + points []G1Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG1Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? 
should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG1Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var runningSum, total g1JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical 
for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, 
points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, 
scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window 
+ // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine 
that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks + 1]chan g1JacExtended
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan g1JacExtended, 1)
+	}
+
+	// c doesn't divide 256; the last window is smaller, so we can allocate fewer buckets
+	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
+	// TODO @gbotrel replace this in code generator
+	if lastC >= 10 {
+		go func(j uint64, points []G1Affine, scalars []fr.Element) {
+			var buckets [1 << (lastC - 1)]G1Affine
+			msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars)
+		}(uint64(nbChunks), points, scalars)
+	} else {
+		go func(j uint64, points []G1Affine, scalars []fr.Element) {
+			var buckets [1 << (lastC - 1)]g1JacExtended
+			msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars)
+		}(uint64(nbChunks), points, scalars)
+	}
+
+	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
+		var buckets [1 << (c - 1)]G1Affine
+		msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan g1JacExtended, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.add(&s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:])
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is identical, except for the c constant it declares
+	// duplicating (through template generation) these methods lets us declare the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2...)
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
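// ---------------------------------------------------------------------------
// [editor's note] Illustrative sketch, not part of this patch: the cost
// trade-off described above can be made concrete. With fr.Limbs*64 = 256-bit
// scalars, a window width c gives ceil(256/c) chunks and 2^{c-1} buckets per
// chunk, and the heuristic implemented by bestC further below is roughly
// cost(c) = (256/c) * (nbPoints + 2^c) group operations. For nbPoints = 2^20,
// cost(12) ~= 21.3*(2^20 + 2^12) ~= 22.5M ops while cost(16) = 16*(2^20 + 2^16)
// ~= 17.8M ops, so the heuristic prefers c = 16 there. windowCost is a
// hypothetical, stdlib-only helper mirroring that formula:
func windowCost(nbPoints int, c uint64) float64 {
	const scalarBits = 256 // fr.Limbs * 64 for the curves in this PR
	return float64(scalarBits) / float64(c) * float64(nbPoints+(1<<c))
}
// ---------------------------------------------------------------------------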
+
+	// for each batchAffineMsmCX
+	// step 1
+	// for each scalar, we compute nbChunks digits over c-bit wide windows
+	// if a digit is larger than 2^{c-1}, then we borrow 2^{c} from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum on the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if NbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
+		// if C > 16 && nbPoints < 1 << 23 {
+		// 	C = 16
+		// }
+		return C
+	}
+
+	var C uint64
+	nbSplits := 1
+	nbChunks := 0
+	for nbChunks < config.NbTasks {
+		C = bestC(nbPoints)
+		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+		if (fr.Limbs*64)%C != 0 {
+			nbChunks++
+		}
+		nbChunks *= nbSplits
+		if nbChunks < config.NbTasks {
+			nbSplits <<= 1
+			nbPoints >>= 1
+		}
+	}
+
+	// partition the scalars
+	// note: we do that before the actual chunk processing, as for each c-bit window (starting from the LSW)
+	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
+	var smallValues int
+	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+
+	// if we have more than 10% of small values, we split the processing of the first chunk in 2
+	// we may want to do that in msmInnerG2JacBatchAffine, but that would incur the cost of looping through all scalars one more time
+	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
+
+	// we have nbSplits intermediate results that we must sum together.
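// ---------------------------------------------------------------------------
// [editor's note] Illustrative sketch, not part of this patch: partitionScalars
// implements the signed-digit recoding described in step 1 above. A stdlib-only
// illustration on a single 64-bit scalar (signedDigits is a hypothetical
// helper; the real code works limb by limb on fr.Element and also counts
// "small" scalars):
func signedDigits(scalar uint64, c uint) []int64 {
	var digits []int64
	carry := int64(0)
	mask := uint64(1)<<c - 1
	for shift := uint(0); shift < 64; shift += c {
		d := carry + int64((scalar>>shift)&mask)
		carry = 0
		if d >= 1<<(c-1) { // digit too large: borrow 2^c from the next window
			d -= 1 << c
			carry = 1
		}
		digits = append(digits, d) // d is in [-2^{c-1}, 2^{c-1}), so 2^{c-1} buckets suffice
	}
	if carry != 0 {
		digits = append(digits, carry)
	}
	return digits
}
// Sanity check: sum_i digits[i]*2^{c*i} reconstructs the scalar; with c = 4 and
// scalar = 0xDE, the digits start [-2, -2, 1, 0, ...] since 0xDE = -2 - 2*16 + 256.
// A digit d > 0 adds P to bucket d-1; a digit d < 0 adds -P to bucket -d-1.
// ---------------------------------------------------------------------------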
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 6: + p.msmC6(points, scalars, splitFirstChunk) + + case 7: + p.msmC7(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 9: + p.msmC9(points, scalars, splitFirstChunk) + + case 10: + p.batchAffineMsmC10(points, scalars, splitFirstChunk) + + case 11: + p.batchAffineMsmC11(points, scalars, splitFirstChunk) + + case 12: + p.batchAffineMsmC12(points, scalars, splitFirstChunk) + + case 13: + p.batchAffineMsmC13(points, scalars, splitFirstChunk) + + case 14: + p.batchAffineMsmC14(points, scalars, splitFirstChunk) + + case 15: + p.batchAffineMsmC15(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + case 20: + p.batchAffineMsmC20(points, scalars, splitFirstChunk) + + case 21: + p.batchAffineMsmC21(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { + var _p g2JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG2Affine struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G2Affine +} + +func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG2Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG2Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG2Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG2Affine) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG2Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { 
+ B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG2AffineBatchAffine(chunk uint64, + chRes chan<- g2JacExtended, + buckets []G2Affine, + c uint64, + points []G2Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG2Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG2Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +func (p *G2Jac) batchAffineMsmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else 
{ + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that 
buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + 
chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if 
!splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars 
[]fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index 7cc9521976..6f5611f563 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -187,6 +187,39 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + var result1, result2 G1Jac + for _, c := range cRange { + scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) + msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) + msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + if !result1.Equal(&result2) { + return false + } + } + return true + }, + genScalar, + )) + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -245,12 +278,19 @@ func BenchmarkMultiExpG1(b *testing.B) { for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -478,6 +518,39 @@ func TestMultiExpG2(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + var result1, result2 G2Jac + for _, c := range cRange { + scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) + msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false) + msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + if !result1.Equal(&result2) { + return false + } + } + return true + }, + genScalar, + )) + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -536,12 +609,19 @@ func BenchmarkMultiExpG2(b *testing.B) { for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bls24-317/g1.go b/ecc/bls24-317/g1.go index b740a62a2e..a7198ef2ea 100644 --- a/ecc/bls24-317/g1.go +++ b/ecc/bls24-317/g1.go @@ -981,3 +981,77 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin toReturnAff := BatchJacobianToAffineG1(toReturn) return toReturnAff } + +// batch add/dbl in affine coordinates +// using batch inversion +// cost add: 5*batchSize M + 1I, dbl: +1M +func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { + if batchSize == 0 { + return + } + var isDbl [MAX_BATCH_SIZE]bool + var lambda [MAX_BATCH_SIZE]fp.Element + + { + var lambdain [MAX_BATCH_SIZE]fp.Element + + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + } + + // invert denominator + BatchInvertG1Affine(&lambda, &lambdain, batchSize) + + } + + var d fp.Element + var rr G1Affine + + for j := 0; j < batchSize; j++ { + // computa lambda, distinguishing dbl / add + if isDbl[j] { + d.Square(&P[j].X) + lambda[j].Mul(&lambda[j], &d) + d.Double(&lambda[j]) + lambda[j].Add(&lambda[j], &d) + } else { + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) + } + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[j].X) + rr.X.Sub(&rr.X, &P[j].X) + d.Sub(&R[j].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[j].Y) + R[j].Set(&rr) + } +} + +// batch inversion +// similar to BatchInvertfp.Element, ignores edge cases +func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { + + var accumulator fp.Element + accumulator.SetOne() + + for i := 0; i < n; i++ { + res[i] = accumulator + accumulator.Mul(&accumulator, &a[i]) + } + + accumulator.Inverse(&accumulator) + + for i := n - 1; i >= 0; i-- { + res[i].Mul(&res[i], &accumulator) + accumulator.Mul(&accumulator, &a[i]) + } +} diff --git a/ecc/bls24-317/g2.go b/ecc/bls24-317/g2.go index 35d76023aa..907c1db13b 100644 --- a/ecc/bls24-317/g2.go +++ b/ecc/bls24-317/g2.go @@ -991,3 +991,77 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin }) return toReturn } + +// batch add/dbl in affine coordinates +// using batch inversion +// cost add: 5*batchSize M + 1I, dbl: +1M +func 
BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) {
+	if batchSize == 0 {
+		return
+	}
+	var isDbl [MAX_BATCH_SIZE]bool
+	var lambda [MAX_BATCH_SIZE]fptower.E4
+
+	{
+		var lambdain [MAX_BATCH_SIZE]fptower.E4
+
+		for j := 0; j < batchSize; j++ {
+			// detect dbl vs add & compute denominator
+			if P[j].Equal(R[j]) {
+				isDbl[j] = true
+				lambdain[j].Double(&P[j].Y)
+			} else {
+				lambdain[j].Sub(&P[j].X, &R[j].X)
+			}
+		}
+
+		// invert denominator
+		BatchInvertG2Affine(&lambda, &lambdain, batchSize)
+
+	}
+
+	var d fptower.E4
+	var rr G2Affine
+
+	for j := 0; j < batchSize; j++ {
+		// compute lambda, distinguishing dbl / add
+		if isDbl[j] {
+			d.Square(&P[j].X)
+			lambda[j].Mul(&lambda[j], &d)
+			d.Double(&lambda[j])
+			lambda[j].Add(&lambda[j], &d)
+		} else {
+			d.Sub(&P[j].Y, &R[j].Y)
+			lambda[j].Mul(&lambda[j], &d)
+		}
+
+		// compute X, Y
+		rr.X.Square(&lambda[j])
+		rr.X.Sub(&rr.X, &R[j].X)
+		rr.X.Sub(&rr.X, &P[j].X)
+		d.Sub(&R[j].X, &rr.X)
+		rr.Y.Mul(&lambda[j], &d)
+		rr.Y.Sub(&rr.Y, &R[j].Y)
+		R[j].Set(&rr)
+	}
+}
+
+// batch inversion
+// similar to BatchInvert for fptower.E4 elements, but ignores edge cases
+func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fptower.E4, n int) {
+
+	var accumulator fptower.E4
+	accumulator.SetOne()
+
+	for i := 0; i < n; i++ {
+		res[i] = accumulator
+		accumulator.Mul(&accumulator, &a[i])
+	}
+
+	accumulator.Inverse(&accumulator)
+
+	for i := n - 1; i >= 0; i-- {
+		res[i].Mul(&res[i], &accumulator)
+		accumulator.Mul(&accumulator, &a[i])
+	}
+}
diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go
new file mode 100644
index 0000000000..6623e42510
--- /dev/null
+++ b/ecc/bls24-317/multiexp_affine.go
@@ -0,0 +1,1883 @@
+// Copyright 2020 ConsenSys Software Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package bls24317
+
+import (
+	"errors"
+	"github.com/consensys/gnark-crypto/ecc"
+	"github.com/consensys/gnark-crypto/ecc/bls24-317/fr"
+	"math"
+	"runtime"
+)
+
+const MAX_BATCH_SIZE = 600
+
+type batchOp struct {
+	bucketID, pointID uint32
+}
+
+func (o batchOp) isNeg() bool {
+	return o.pointID&1 == 1
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) {
+	var _p G1Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
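// ---------------------------------------------------------------------------
// [editor's note] Illustrative sketch, not part of this patch: BatchInvertG2Affine
// above is the classic Montgomery batch-inversion trick, which replaces n field
// inversions with a single inversion plus roughly 3*(n-1) multiplications; that
// shared inversion is what makes the "cost add: 5*batchSize M + 1I" of
// BatchAddG2Affine attractive. A stdlib-only version over Z/pZ (batchInvert is
// a hypothetical helper; assumes `import "math/big"` and nonzero inputs):
func batchInvert(a []*big.Int, p *big.Int) []*big.Int {
	res := make([]*big.Int, len(a))
	acc := big.NewInt(1)
	for i := range a { // forward pass: res[i] = a[0]*a[1]*...*a[i-1] mod p
		res[i] = new(big.Int).Set(acc)
		acc.Mul(acc, a[i]).Mod(acc, p)
	}
	acc.ModInverse(acc, p) // the single inversion for the whole batch
	for i := len(a) - 1; i >= 0; i-- { // backward pass peels off one factor at a time
		res[i].Mul(res[i], acc).Mod(res[i], p) // res[i] = 1/a[i] mod p
		acc.Mul(acc, a[i]).Mod(acc, p)         // acc = 1/(a[0]*...*a[i-1])
	}
	return res
}
// ---------------------------------------------------------------------------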
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is identical, except for the c constant it declares
+	// duplicating (through template generation) these methods lets us declare the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2...)
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each batchAffineMsmCX
+	// step 1
+	// for each scalar, we compute nbChunks digits over c-bit wide windows
+	// if a digit is larger than 2^{c-1}, then we borrow 2^{c} from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum on the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if NbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. + _p := make([]G1Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 6: + p.msmC6(points, scalars, splitFirstChunk) + + case 7: + p.msmC7(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 9: + p.msmC9(points, scalars, splitFirstChunk) + + case 10: + p.batchAffineMsmC10(points, scalars, splitFirstChunk) + + case 11: + p.batchAffineMsmC11(points, scalars, splitFirstChunk) + + case 12: + p.batchAffineMsmC12(points, scalars, splitFirstChunk) + + case 13: + p.batchAffineMsmC13(points, scalars, splitFirstChunk) + + case 14: + p.batchAffineMsmC14(points, scalars, splitFirstChunk) + + case 15: + p.batchAffineMsmC15(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + case 20: + p.batchAffineMsmC20(points, scalars, splitFirstChunk) + + case 21: + p.batchAffineMsmC21(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { + var _p g1JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG1Affine struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G1Affine +} + +func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { + batchSize := 
len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG1Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG1Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG1Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG1Affine) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG1Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { + B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG1AffineBatchAffine(chunk uint64, + chRes chan<- g1JacExtended, + buckets []G1Affine, + c uint64, + points []G1Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG1Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? 
should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG1Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var runningSum, total g1JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical 
for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, 
points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, 
scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window 
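(An aside on how the per-chunk results recombine: the chunk running over window j carries weight 2^{c*j}, so msmReduceChunkG1AffineBatchAffine folds the chunk channels from the most significant window down, doubling c times between additions. Below is a scalar-arithmetic analogue of that Horner-style reduction; it models group elements as uint64 values and is illustrative only, not the actual group code.)

// reduceChunks mimics msmReduceChunk on plain integers: chunks are given
// least-significant first, and we double c times between successive chunks.
package main

import "fmt"

func reduceChunks(chunks []uint64, c int) uint64 {
	total := chunks[len(chunks)-1]
	for j := len(chunks) - 2; j >= 0; j-- {
		for l := 0; l < c; l++ {
			total += total // "double": multiply the accumulated weight by 2
		}
		total += chunks[j]
	}
	return total
}

func main() {
	// 0b110101 split into 2-bit chunks, least significant first: 01, 01, 11.
	fmt.Println(reduceChunks([]uint64{0b01, 0b01, 0b11}, 2)) // 53
}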
+ // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine 
that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks + 1]chan g1JacExtended
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan g1JacExtended, 1)
+	}
+
+	// c doesn't divide 256; the last window is smaller, so we can allocate fewer buckets
+	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
+	// TODO @gbotrel replace this in code generator
+	if lastC >= 10 {
+		go func(j uint64, points []G1Affine, scalars []fr.Element) {
+			var buckets [1 << (lastC - 1)]G1Affine
+			msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars)
+		}(uint64(nbChunks), points, scalars)
+	} else {
+		go func(j uint64, points []G1Affine, scalars []fr.Element) {
+			var buckets [1 << (lastC - 1)]g1JacExtended
+			msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars)
+		}(uint64(nbChunks), points, scalars)
+	}
+
+	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
+		var buckets [1 << (c - 1)]G1Affine
+		msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan g1JacExtended, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.add(&s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:])
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// the batchAffineMsmCX methods are identical except for the c constant each declares;
+	// duplicating them (through template generation) allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics,
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2...)
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
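(The cost model behind this choice is made concrete in the bestC closure below: cost(c) ≈ bits/c * (nbPoints + 2^c). A standalone sketch of that heuristic follows; bits is hardcoded to fr.Limbs*64 = 256 here as an assumption for illustration, and the real closure additionally leaves room for empirical tuning.)

package main

import (
	"fmt"
	"math"
)

// bestC picks the window width minimizing the approximate group-operation
// count: bits/c windows, each costing about nbPoints bucket additions plus
// 2^c additions for the bucket reduction.
func bestC(nbPoints int) uint64 {
	implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
	const bits = 256 // stands in for fr.Limbs * 64
	var C uint64
	min := math.MaxFloat64
	for _, c := range implementedCs {
		cost := float64(bits*(nbPoints+(1<<c))) / float64(c)
		if cost < min {
			min = cost
			C = c
		}
	}
	return C
}

func main() {
	for _, n := range []int{1 << 10, 1 << 16, 1 << 22} {
		fmt.Printf("nbPoints=%d -> c=%d\n", n, bestC(n))
	}
}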
+
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar, its digits over c-bit wide windows (nbChunks of them)
+	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum on the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
+		// if C > 16 && nbPoints < 1 << 23 {
+		// 	C = 16
+		// }
+		return C
+	}
+
+	var C uint64
+	nbSplits := 1
+	nbChunks := 0
+	for nbChunks < config.NbTasks {
+		C = bestC(nbPoints)
+		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+		if (fr.Limbs*64)%C != 0 {
+			nbChunks++
+		}
+		nbChunks *= nbSplits
+		if nbChunks < config.NbTasks {
+			nbSplits <<= 1
+			nbPoints >>= 1
+		}
+	}
+
+	// partition the scalars
+	// note: we do that before the actual chunk processing, as for each c-bit window (starting from the LSW),
+	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
+	var smallValues int
+	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+
+	// if we have more than 10% of small values, we split the processing of the first chunk in 2
+	// we may want to do that in msmInnerG2JacBatchAffine, but that would incur the cost of looping through all scalars one more time
+	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
+
+	// we have nbSplits intermediate results that we must sum together.
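(Step 1 above, the signed-window recoding done by partitionScalars, can be illustrated on a toy scalar. This sketch uses one common convention, borrowing whenever a digit is at least 2^{c-1}; the library's low-level handling, with its msbWindow flag and in-place scalar rewrite, differs in detail.)

package main

import "fmt"

// signedDigits splits a scalar into c-bit digits from the least significant
// window up; whenever a digit reaches 2^{c-1}, subtract 2^c from it and carry
// 1 into the next window, so digits fit in half as many buckets (a negative
// digit adds -G instead of G).
func signedDigits(scalar uint64, c uint) []int64 {
	var digits []int64
	carry := int64(0)
	for scalar != 0 || carry != 0 {
		d := int64(scalar&((1<<c)-1)) + carry
		scalar >>= c
		carry = 0
		if d >= 1<<(c-1) {
			d -= 1 << c
			carry = 1
		}
		digits = append(digits, d)
	}
	return digits
}

func main() {
	c := uint(4)
	digits := signedDigits(0xDEAD, c)
	fmt.Println(digits) // [-3 -5 -1 -2 1]
	// sanity check: sum of digits[i] * 2^{c*i} reconstructs 0xDEAD
	var x int64
	for i := len(digits) - 1; i >= 0; i-- {
		x = x<<c + digits[i]
	}
	fmt.Println(x == 0xDEAD) // true
}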
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 6: + p.msmC6(points, scalars, splitFirstChunk) + + case 7: + p.msmC7(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 9: + p.msmC9(points, scalars, splitFirstChunk) + + case 10: + p.batchAffineMsmC10(points, scalars, splitFirstChunk) + + case 11: + p.batchAffineMsmC11(points, scalars, splitFirstChunk) + + case 12: + p.batchAffineMsmC12(points, scalars, splitFirstChunk) + + case 13: + p.batchAffineMsmC13(points, scalars, splitFirstChunk) + + case 14: + p.batchAffineMsmC14(points, scalars, splitFirstChunk) + + case 15: + p.batchAffineMsmC15(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + case 20: + p.batchAffineMsmC20(points, scalars, splitFirstChunk) + + case 21: + p.batchAffineMsmC21(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { + var _p g2JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG2Affine struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G2Affine +} + +func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG2Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG2Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG2Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG2Affine) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG2Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { 
+ B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG2AffineBatchAffine(chunk uint64, + chRes chan<- g2JacExtended, + buckets []G2Affine, + c uint64, + points []G2Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG2Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG2Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +func (p *G2Jac) batchAffineMsmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else 
{ + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that 
buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + 
chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if 
!splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars 
[]fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index 3f435be696..b89f8a2375 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -187,6 +187,39 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + var result1, result2 G1Jac + for _, c := range cRange { + scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) + msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) + msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + if !result1.Equal(&result2) { + return false + } + } + return true + }, + genScalar, + )) + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -245,12 +278,19 @@ func BenchmarkMultiExpG1(b *testing.B) { for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -478,6 +518,39 @@ func TestMultiExpG2(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + var result1, result2 G2Jac + for _, c := range cRange { + scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) + msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false) + msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + if !result1.Equal(&result2) { + return false + } + } + return true + }, + genScalar, + )) + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -536,12 +609,19 @@ func BenchmarkMultiExpG2(b *testing.B) { for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bn254/g1.go b/ecc/bn254/g1.go index 05c3663fce..4056491a53 100644 --- a/ecc/bn254/g1.go +++ b/ecc/bn254/g1.go @@ -951,3 +951,77 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin toReturnAff := BatchJacobianToAffineG1(toReturn) return toReturnAff } + +// batch add/dbl in affine coordinates +// using batch inversion +// cost add: 5*batchSize M + 1I, dbl: +1M +func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { + if batchSize == 0 { + return + } + var isDbl [MAX_BATCH_SIZE]bool + var lambda [MAX_BATCH_SIZE]fp.Element + + { + var lambdain [MAX_BATCH_SIZE]fp.Element + + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + } + + // invert denominator + BatchInvertG1Affine(&lambda, &lambdain, batchSize) + + } + + var d fp.Element + var rr G1Affine + + for j := 0; j < batchSize; j++ { + // computa lambda, distinguishing dbl / add + if isDbl[j] { + d.Square(&P[j].X) + lambda[j].Mul(&lambda[j], &d) + d.Double(&lambda[j]) + lambda[j].Add(&lambda[j], &d) + } else { + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) + } + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[j].X) + rr.X.Sub(&rr.X, &P[j].X) + d.Sub(&R[j].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[j].Y) + R[j].Set(&rr) + } +} + +// batch inversion +// similar to BatchInvertfp.Element, ignores edge cases +func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { + + var accumulator fp.Element + accumulator.SetOne() + + for i := 0; i < n; i++ { + res[i] = accumulator + accumulator.Mul(&accumulator, &a[i]) + } + + accumulator.Inverse(&accumulator) + + for i := n - 1; i >= 0; i-- { + res[i].Mul(&res[i], &accumulator) + accumulator.Mul(&accumulator, &a[i]) + } +} diff --git a/ecc/bn254/g2.go b/ecc/bn254/g2.go index db2118d180..deeb006578 100644 --- a/ecc/bn254/g2.go +++ b/ecc/bn254/g2.go @@ -980,3 +980,77 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin }) return toReturn } + +// batch add/dbl in affine coordinates +// using batch inversion +// cost add: 5*batchSize M + 1I, dbl: +1M +func BatchAddG2Affine(R []*G2Affine, P 
[]G2Affine, batchSize int) {
+	if batchSize == 0 {
+		return
+	}
+	var isDbl [MAX_BATCH_SIZE]bool
+	var lambda [MAX_BATCH_SIZE]fptower.E2
+
+	{
+		var lambdain [MAX_BATCH_SIZE]fptower.E2
+
+		for j := 0; j < batchSize; j++ {
+			// detect dbl vs add & compute denominator
+			if P[j].Equal(R[j]) {
+				isDbl[j] = true
+				lambdain[j].Double(&P[j].Y)
+			} else {
+				lambdain[j].Sub(&P[j].X, &R[j].X)
+			}
+		}
+
+		// invert denominator
+		BatchInvertG2Affine(&lambda, &lambdain, batchSize)
+
+	}
+
+	var d fptower.E2
+	var rr G2Affine
+
+	for j := 0; j < batchSize; j++ {
+		// compute lambda, distinguishing dbl / add
+		if isDbl[j] {
+			d.Square(&P[j].X)
+			lambda[j].Mul(&lambda[j], &d)
+			d.Double(&lambda[j])
+			lambda[j].Add(&lambda[j], &d)
+		} else {
+			d.Sub(&P[j].Y, &R[j].Y)
+			lambda[j].Mul(&lambda[j], &d)
+		}
+
+		// compute X, Y
+		rr.X.Square(&lambda[j])
+		rr.X.Sub(&rr.X, &R[j].X)
+		rr.X.Sub(&rr.X, &P[j].X)
+		d.Sub(&R[j].X, &rr.X)
+		rr.Y.Mul(&lambda[j], &d)
+		rr.Y.Sub(&rr.Y, &R[j].Y)
+		R[j].Set(&rr)
+	}
+}
+
+// batch inversion
+// similar to the field-element BatchInvert; ignores edge cases (zero inputs)
+func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fptower.E2, n int) {
+
+	var accumulator fptower.E2
+	accumulator.SetOne()
+
+	for i := 0; i < n; i++ {
+		res[i] = accumulator
+		accumulator.Mul(&accumulator, &a[i])
+	}
+
+	accumulator.Inverse(&accumulator)
+
+	for i := n - 1; i >= 0; i-- {
+		res[i].Mul(&res[i], &accumulator)
+		accumulator.Mul(&accumulator, &a[i])
+	}
+}
diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go
new file mode 100644
index 0000000000..d91b3cb89c
--- /dev/null
+++ b/ecc/bn254/multiexp_affine.go
@@ -0,0 +1,1883 @@
+// Copyright 2020 ConsenSys Software Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package bn254
+
+import (
+	"errors"
+	"github.com/consensys/gnark-crypto/ecc"
+	"github.com/consensys/gnark-crypto/ecc/bn254/fr"
+	"math"
+	"runtime"
+)
+
+const MAX_BATCH_SIZE = 600
+
+type batchOp struct {
+	bucketID, pointID uint32
+}
+
+func (o batchOp) isNeg() bool {
+	return o.pointID&1 == 1
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) {
+	var _p G1Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
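(The batchOp type above packs the point index and its sign into pointID: index in the high 31 bits, sign in the lowest bit, which is why the chunk processors build ops with uint32(i) << 1 and dereference points with pointID >> 1. A minimal round-trip sketch follows; the newOp helper is hypothetical, introduced only for illustration.)

package main

import "fmt"

type batchOp struct {
	bucketID, pointID uint32
}

func (o batchOp) isNeg() bool { return o.pointID&1 == 1 }

// newOp tags a point index with its sign in the lowest bit.
func newOp(pointIndex uint32, neg bool) batchOp {
	op := batchOp{pointID: pointIndex << 1}
	if neg {
		op.pointID |= 1 // "add the negation of this point"
	}
	return op
}

func main() {
	op := newOp(42, true)
	fmt.Println(op.pointID>>1, op.isNeg()) // 42 true
}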
+func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, and on G1 vs G2)
+	// --> for example, on BN254, a G1 point fits into one 64-byte cache line, but a G2 point doesn't.
+
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar, over c-bit wide windows, nbChunks digits
+	// if a digit is larger than 2^{c-1}, we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand-picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
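+		// (worked example, illustrative only: with fr.Limbs = 4, i.e. 256-bit scalars,
+		// and nbPoints = 2^20, c = 16 costs 256/16 * (2^20 + 2^16) ~= 17.8M group ops,
+		// while c = 12 costs 256/12 * (2^20 + 2^12) ~= 22.5M, so the loop above picks C = 16)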
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. + _p := make([]G1Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 6: + p.msmC6(points, scalars, splitFirstChunk) + + case 7: + p.msmC7(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 9: + p.msmC9(points, scalars, splitFirstChunk) + + case 10: + p.batchAffineMsmC10(points, scalars, splitFirstChunk) + + case 11: + p.batchAffineMsmC11(points, scalars, splitFirstChunk) + + case 12: + p.batchAffineMsmC12(points, scalars, splitFirstChunk) + + case 13: + p.batchAffineMsmC13(points, scalars, splitFirstChunk) + + case 14: + p.batchAffineMsmC14(points, scalars, splitFirstChunk) + + case 15: + p.batchAffineMsmC15(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + case 20: + p.batchAffineMsmC20(points, scalars, splitFirstChunk) + + case 21: + p.batchAffineMsmC21(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { + var _p g1JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG1Affine struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G1Affine +} + +func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { + batchSize := 
len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG1Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG1Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG1Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG1Affine) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG1Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { + B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG1AffineBatchAffine(chunk uint64, + chRes chan<- g1JacExtended, + buckets []G1Affine, + c uint64, + points []G1Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG1Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? 
should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG1Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var runningSum, total g1JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical 
for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, 
points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, 
scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window 
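+	// (here c = 16 divides fr.Limbs * 64 = 256 exactly, giving 16 chunks and 16 such
+	// goroutines, with no smaller trailing window to special-case)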
+ // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine 
that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks + 1]chan g1JacExtended
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan g1JacExtended, 1)
+	}
+
+	// c doesn't divide 256; the last window is smaller, so we can allocate fewer buckets
+	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
+	// TODO @gbotrel replace this in code generator
+	if lastC >= 10 {
+		go func(j uint64, points []G1Affine, scalars []fr.Element) {
+			var buckets [1 << (lastC - 1)]G1Affine
+			msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars)
+		}(uint64(nbChunks), points, scalars)
+	} else {
+		go func(j uint64, points []G1Affine, scalars []fr.Element) {
+			var buckets [1 << (lastC - 1)]g1JacExtended
+			msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars)
+		}(uint64(nbChunks), points, scalars)
+	}
+
+	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
+		var buckets [1 << (c - 1)]G1Affine
+		msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan g1JacExtended, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.add(&s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:])
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, and on G1 vs G2)
+	// --> for example, on BN254, a G1 point fits into one 64-byte cache line, but a G2 point doesn't.
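+	// (concretely, an affine BN254 G1 point is two fp.Element = 64 bytes, while an affine
+	// G2 point is two fptower.E2 = 128 bytes, i.e. two cache lines)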
+
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar, over c-bit wide windows, nbChunks digits
+	// if a digit is larger than 2^{c-1}, we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand-picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
+		// if C > 16 && nbPoints < 1 << 23 {
+		// 	C = 16
+		// }
+		return C
+	}
+
+	var C uint64
+	nbSplits := 1
+	nbChunks := 0
+	for nbChunks < config.NbTasks {
+		C = bestC(nbPoints)
+		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+		if (fr.Limbs*64)%C != 0 {
+			nbChunks++
+		}
+		nbChunks *= nbSplits
+		if nbChunks < config.NbTasks {
+			nbSplits <<= 1
+			nbPoints >>= 1
+		}
+	}
+
+	// partition the scalars
+	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
+	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
+	var smallValues int
+	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+
+	// if we have more than 10% of small values, we split the processing of the first chunk in 2
+	// we may want to do that in msmInnerG2JacBatchAffine, but that would incur a cost of looping through all scalars one more time
+	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
+
+	// we have nbSplits intermediate results that we must sum together.
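+	// (the first nbSplits-1 slices are summed on their own goroutines below; the last
+	// slice is processed on the current goroutine and partial results are added into p
+	// as they complete)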
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 6: + p.msmC6(points, scalars, splitFirstChunk) + + case 7: + p.msmC7(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 9: + p.msmC9(points, scalars, splitFirstChunk) + + case 10: + p.batchAffineMsmC10(points, scalars, splitFirstChunk) + + case 11: + p.batchAffineMsmC11(points, scalars, splitFirstChunk) + + case 12: + p.batchAffineMsmC12(points, scalars, splitFirstChunk) + + case 13: + p.batchAffineMsmC13(points, scalars, splitFirstChunk) + + case 14: + p.batchAffineMsmC14(points, scalars, splitFirstChunk) + + case 15: + p.batchAffineMsmC15(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + case 20: + p.batchAffineMsmC20(points, scalars, splitFirstChunk) + + case 21: + p.batchAffineMsmC21(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { + var _p g2JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG2Affine struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G2Affine +} + +func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG2Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG2Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG2Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG2Affine) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG2Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { 
+ B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG2AffineBatchAffine(chunk uint64, + chRes chan<- g2JacExtended, + buckets []G2Affine, + c uint64, + points []G2Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG2Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG2Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +func (p *G2Jac) batchAffineMsmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else 
{ + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that 
buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + 
chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if 
!splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars 
[]fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index 16db1a5bbf..61341020f0 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -187,6 +187,39 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + var result1, result2 G1Jac + for _, c := range cRange { + scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) + msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) + msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + if !result1.Equal(&result2) { + return false + } + } + return true + }, + genScalar, + )) + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -245,12 +278,19 @@ func BenchmarkMultiExpG1(b *testing.B) { for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -478,6 +518,39 @@ func TestMultiExpG2(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+				FromMont()
+			}
+
+			var result1, result2 G2Jac
+			for _, c := range cRange {
+				scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU())
+				msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false)
+				msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false)
+				if !result1.Equal(&result2) {
+					return false
+				}
+			}
+			return true
+		},
+		genScalar,
+	))
+
 	// note : this test is here as we expect to have a different multiExp than the above bucket method
 	// for small number of points
 	properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll(
@@ -536,12 +609,19 @@ func BenchmarkMultiExpG2(b *testing.B) {
 	for i := 5; i <= pow; i++ {
 		using := 1 << i
 
-		b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) {
+		b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) {
 			b.ResetTimer()
 			for j := 0; j < b.N; j++ {
 				testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{})
 			}
 		})
+
+		b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) {
+			b.ResetTimer()
+			for j := 0; j < b.N; j++ {
+				testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{})
+			}
+		})
 	}
 }
diff --git a/ecc/bw6-633/g1.go b/ecc/bw6-633/g1.go
index d8a25ab2a8..f70d2b30cc 100644
--- a/ecc/bw6-633/g1.go
+++ b/ecc/bw6-633/g1.go
@@ -1083,3 +1083,77 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin
 	toReturnAff := BatchJacobianToAffineG1(toReturn)
 	return toReturnAff
 }
+
+// batch add/dbl in affine coordinates
+// using batch inversion
+// cost add: 5*batchSize M + 1I, dbl: +1M
+func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) {
+	if batchSize == 0 {
+		return
+	}
+	var isDbl [MAX_BATCH_SIZE]bool
+	var lambda [MAX_BATCH_SIZE]fp.Element
+
+	{
+		var lambdain [MAX_BATCH_SIZE]fp.Element
+
+		for j := 0; j < batchSize; j++ {
+			// detect dbl vs add & compute denominator
+			if P[j].Equal(R[j]) {
+				isDbl[j] = true
+				lambdain[j].Double(&P[j].Y)
+			} else {
+				lambdain[j].Sub(&P[j].X, &R[j].X)
+			}
+		}
+
+		// invert denominator
+		BatchInvertG1Affine(&lambda, &lambdain, batchSize)
+
+	}
+
+	var d fp.Element
+	var rr G1Affine
+
+	for j := 0; j < batchSize; j++ {
+		// compute lambda, distinguishing dbl / add
+		if isDbl[j] {
+			d.Square(&P[j].X)
+			lambda[j].Mul(&lambda[j], &d)
+			d.Double(&lambda[j])
+			lambda[j].Add(&lambda[j], &d)
+		} else {
+			d.Sub(&P[j].Y, &R[j].Y)
+			lambda[j].Mul(&lambda[j], &d)
+		}
+
+		// compute X, Y
+		rr.X.Square(&lambda[j])
+		rr.X.Sub(&rr.X, &R[j].X)
+		rr.X.Sub(&rr.X, &P[j].X)
+		d.Sub(&R[j].X, &rr.X)
+		rr.Y.Mul(&lambda[j], &d)
+		rr.Y.Sub(&rr.Y, &R[j].Y)
+		R[j].Set(&rr)
+	}
+}
+
+// batch inversion
+// similar to the fp.Element batch inversion; ignores edge cases (zero inputs)
+func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) {
+
+	var accumulator fp.Element
+	accumulator.SetOne()
+
+	for i := 0; i < n; i++ {
+		res[i] = accumulator
+		accumulator.Mul(&accumulator, &a[i])
+	}
+
+	accumulator.Inverse(&accumulator)
+
+	for i := n - 1; i >= 0; i-- {
+		res[i].Mul(&res[i], &accumulator)
+		accumulator.Mul(&accumulator, &a[i])
+	}
+}
diff --git a/ecc/bw6-633/g2.go b/ecc/bw6-633/g2.go
index 8019989178..f9284d2ec7 100644
--- a/ecc/bw6-633/g2.go
+++ b/ecc/bw6-633/g2.go
@@ -946,3 +946,77 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin
 	})
 	return toReturn
 }
+
+// batch add/dbl in affine coordinates
+// using batch inversion
+// cost add: 5*batchSize M + 1I, dbl: +1M
+func BatchAddG2Affine(R
[]*G2Affine, P []G2Affine, batchSize int) {
+	if batchSize == 0 {
+		return
+	}
+	var isDbl [MAX_BATCH_SIZE]bool
+	var lambda [MAX_BATCH_SIZE]fp.Element
+
+	{
+		var lambdain [MAX_BATCH_SIZE]fp.Element
+
+		for j := 0; j < batchSize; j++ {
+			// detect dbl vs add & compute denominator
+			if P[j].Equal(R[j]) {
+				isDbl[j] = true
+				lambdain[j].Double(&P[j].Y)
+			} else {
+				lambdain[j].Sub(&P[j].X, &R[j].X)
+			}
+		}
+
+		// invert denominator
+		BatchInvertG2Affine(&lambda, &lambdain, batchSize)
+
+	}
+
+	var d fp.Element
+	var rr G2Affine
+
+	for j := 0; j < batchSize; j++ {
+		// compute lambda, distinguishing dbl / add
+		if isDbl[j] {
+			d.Square(&P[j].X)
+			lambda[j].Mul(&lambda[j], &d)
+			d.Double(&lambda[j])
+			lambda[j].Add(&lambda[j], &d)
+		} else {
+			d.Sub(&P[j].Y, &R[j].Y)
+			lambda[j].Mul(&lambda[j], &d)
+		}
+
+		// compute X, Y
+		rr.X.Square(&lambda[j])
+		rr.X.Sub(&rr.X, &R[j].X)
+		rr.X.Sub(&rr.X, &P[j].X)
+		d.Sub(&R[j].X, &rr.X)
+		rr.Y.Mul(&lambda[j], &d)
+		rr.Y.Sub(&rr.Y, &R[j].Y)
+		R[j].Set(&rr)
+	}
+}
+
+// batch inversion
+// similar to the fp.Element batch inversion; ignores edge cases (zero inputs)
+func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) {
+
+	var accumulator fp.Element
+	accumulator.SetOne()
+
+	for i := 0; i < n; i++ {
+		res[i] = accumulator
+		accumulator.Mul(&accumulator, &a[i])
+	}
+
+	accumulator.Inverse(&accumulator)
+
+	for i := n - 1; i >= 0; i-- {
+		res[i].Mul(&res[i], &accumulator)
+		accumulator.Mul(&accumulator, &a[i])
+	}
+}
diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go
new file mode 100644
index 0000000000..79740b7a69
--- /dev/null
+++ b/ecc/bw6-633/multiexp_affine.go
@@ -0,0 +1,857 @@
+// Copyright 2020 ConsenSys Software Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package bw6633
+
+import (
+	"errors"
+	"github.com/consensys/gnark-crypto/ecc"
+	"github.com/consensys/gnark-crypto/ecc/bw6-633/fr"
+	"math"
+	"runtime"
+)
+
+const MAX_BATCH_SIZE = 600
+
+type batchOp struct {
+	bucketID, pointID uint32
+}
+
+func (o batchOp) isNeg() bool {
+	return o.pointID&1 == 1
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) {
+	var _p G1Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
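+//
+// A minimal usage sketch (hypothetical points/scalars slices of equal length):
+//
+//	var res G1Jac
+//	if _, err := res.MultiExpBatchAffine(points, scalars, ecc.MultiExpConfig{}); err != nil {
+//		// handle the error (mismatched lengths, invalid config)
+//	}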
+func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	//   --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunk digits
+	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks,
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 8, 16}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for a G2 MultiExp with > 8M points, hand-picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
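+		// worked example of the cost model above (assuming fr.Limbs*64 = 320 bits; hypothetical sizes):
+		// nbPoints = 2^20 with c = 16 costs (320/16)*(2^20+2^16) ≈ 2.2e7 group operations,
+		// while c = 8 costs (320/8)*(2^20+2^8) ≈ 4.2e7, so c = 16 is selected here.
+		// the commented-out cap below is one such hand tuning: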
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. + _p := make([]G1Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { + var _p g1JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG1Affine struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G1Affine +} + +func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG1Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG1Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG1Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG1Affine) CanAdd(bID 
uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG1Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { + B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG1AffineBatchAffine(chunk uint64, + chRes chan<- g1JacExtended, + buckets []G1Affine, + c uint64, + points []G1Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG1Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG1Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1]
+
+	var runningSum, total g1JacExtended
+	runningSum.setInfinity()
+	total.setInfinity()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].IsInfinity() {
+			runningSum.addMixed(&buckets[k])
+		}
+		total.add(&runningSum)
+	}
+
+	chRes <- total
+
+}
+
+func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
+	const (
+		c        = 16                  // scalars partitioned into c-bit radixes
+		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	)
+
+	// for each chunk, spawn one goroutine that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each goroutine sends its result on the chChunks[i] channel
+	var chChunks [nbChunks]chan g1JacExtended
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan g1JacExtended, 1)
+	}
+
+	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
+		var buckets [1 << (c - 1)]G1Affine
+		msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan g1JacExtended, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.add(&s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:])
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	//   --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunk digits
+	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
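+	// worked example (hypothetical, c = 4): a window digit of 13 > 2^3 becomes
+	// 13 - 16 = -3, and the next window's digit is incremented by 1.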
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks,
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 8, 16}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for a G2 MultiExp with > 8M points, hand-picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
+		// if C > 16 && nbPoints < 1 << 23 {
+		// 	C = 16
+		// }
+		return C
+	}
+
+	var C uint64
+	nbSplits := 1
+	nbChunks := 0
+	for nbChunks < config.NbTasks {
+		C = bestC(nbPoints)
+		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+		if (fr.Limbs*64)%C != 0 {
+			nbChunks++
+		}
+		nbChunks *= nbSplits
+		if nbChunks < config.NbTasks {
+			nbSplits <<= 1
+			nbPoints >>= 1
+		}
+	}
+
+	// partition the scalars
+	// note: we do that before the actual chunk processing, as for each c-bit window (starting from the LSW)
+	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
+	var smallValues int
+	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+
+	// if we have more than 10% of small values, we split the processing of the first chunk in 2
+	// we may want to do that in msmInnerG2JacBatchAffine, but that would incur the cost of looping through all scalars one more time
+	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
+
+	// we have nbSplits intermediate results that we must sum together.
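+	// worked example (hypothetical sizes): with nbSplits = 2 and 2^20 points, the goroutine
+	// spawned below handles points[0:2^19], the current goroutine handles points[2^19:],
+	// and the partial G2Jac results are folded into p with AddAssign.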
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { + var _p g2JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG2Affine struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G2Affine +} + +func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG2Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG2Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG2Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG2Affine) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG2Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { + B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = 
queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG2AffineBatchAffine(chunk uint64, + chRes chan<- g2JacExtended, + buckets []G2Affine, + c uint64, + points []G2Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG2Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG2Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 401434b1bf..282b60e573 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -187,6 +187,39 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + var result1, result2 G1Jac + for _, c := range cRange { + scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) + msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) + msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + if !result1.Equal(&result2) { + return false + } + } + return true + }, + genScalar, + )) + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -245,12 +278,19 @@ func BenchmarkMultiExpG1(b *testing.B) { for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -478,6 +518,39 @@ func TestMultiExpG2(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+				FromMont()
+			}
+
+			var result1, result2 G2Jac
+			for _, c := range cRange {
+				scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU())
+				msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false)
+				msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false)
+				if !result1.Equal(&result2) {
+					return false
+				}
+			}
+			return true
+		},
+		genScalar,
+	))
+
 	// note : this test is here as we expect to have a different multiExp than the above bucket method
 	// for small number of points
 	properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll(
@@ -536,12 +609,19 @@ func BenchmarkMultiExpG2(b *testing.B) {
 	for i := 5; i <= pow; i++ {
 		using := 1 << i
 
-		b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) {
+		b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) {
 			b.ResetTimer()
 			for j := 0; j < b.N; j++ {
 				testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{})
 			}
 		})
+
+		b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) {
+			b.ResetTimer()
+			for j := 0; j < b.N; j++ {
+				testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{})
+			}
+		})
 	}
 }
diff --git a/ecc/bw6-756/g1.go b/ecc/bw6-756/g1.go
index 6cc8b67f14..038e4f1b42 100644
--- a/ecc/bw6-756/g1.go
+++ b/ecc/bw6-756/g1.go
@@ -1083,3 +1083,77 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin
 	toReturnAff := BatchJacobianToAffineG1(toReturn)
 	return toReturnAff
 }
+
+// batch add/dbl in affine coordinates
+// using batch inversion
+// cost add: 5*batchSize M + 1I, dbl: +1M
+func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) {
+	if batchSize == 0 {
+		return
+	}
+	var isDbl [MAX_BATCH_SIZE]bool
+	var lambda [MAX_BATCH_SIZE]fp.Element
+
+	{
+		var lambdain [MAX_BATCH_SIZE]fp.Element
+
+		for j := 0; j < batchSize; j++ {
+			// detect dbl vs add & compute denominator
+			if P[j].Equal(R[j]) {
+				isDbl[j] = true
+				lambdain[j].Double(&P[j].Y)
+			} else {
+				lambdain[j].Sub(&P[j].X, &R[j].X)
+			}
+		}
+
+		// invert denominator
+		BatchInvertG1Affine(&lambda, &lambdain, batchSize)
+
+	}
+
+	var d fp.Element
+	var rr G1Affine
+
+	for j := 0; j < batchSize; j++ {
+		// compute lambda, distinguishing dbl / add
+		if isDbl[j] {
+			d.Square(&P[j].X)
+			lambda[j].Mul(&lambda[j], &d)
+			d.Double(&lambda[j])
+			lambda[j].Add(&lambda[j], &d)
+		} else {
+			d.Sub(&P[j].Y, &R[j].Y)
+			lambda[j].Mul(&lambda[j], &d)
+		}
+
+		// compute X, Y
+		rr.X.Square(&lambda[j])
+		rr.X.Sub(&rr.X, &R[j].X)
+		rr.X.Sub(&rr.X, &P[j].X)
+		d.Sub(&R[j].X, &rr.X)
+		rr.Y.Mul(&lambda[j], &d)
+		rr.Y.Sub(&rr.Y, &R[j].Y)
+		R[j].Set(&rr)
+	}
+}
+
+// batch inversion
+// similar to the fp.Element batch inversion; ignores edge cases (zero inputs)
+func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) {
+
+	var accumulator fp.Element
+	accumulator.SetOne()
+
+	for i := 0; i < n; i++ {
+		res[i] = accumulator
+		accumulator.Mul(&accumulator, &a[i])
+	}
+
+	accumulator.Inverse(&accumulator)
+
+	for i := n - 1; i >= 0; i-- {
+		res[i].Mul(&res[i], &accumulator)
+		accumulator.Mul(&accumulator, &a[i])
+	}
+}
diff --git a/ecc/bw6-756/g2.go b/ecc/bw6-756/g2.go
index 171069cc4d..cb9fadd15d 100644
--- a/ecc/bw6-756/g2.go
+++ b/ecc/bw6-756/g2.go
@@ -940,3 +940,77 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin
 	})
 	return toReturn
 }
+
+// batch add/dbl in affine coordinates
+// using batch inversion
+// cost add: 5*batchSize M + 1I, dbl: +1M
+func BatchAddG2Affine(R
[]*G2Affine, P []G2Affine, batchSize int) {
+	if batchSize == 0 {
+		return
+	}
+	var isDbl [MAX_BATCH_SIZE]bool
+	var lambda [MAX_BATCH_SIZE]fp.Element
+
+	{
+		var lambdain [MAX_BATCH_SIZE]fp.Element
+
+		for j := 0; j < batchSize; j++ {
+			// detect dbl vs add & compute denominator
+			if P[j].Equal(R[j]) {
+				isDbl[j] = true
+				lambdain[j].Double(&P[j].Y)
+			} else {
+				lambdain[j].Sub(&P[j].X, &R[j].X)
+			}
+		}
+
+		// invert denominator
+		BatchInvertG2Affine(&lambda, &lambdain, batchSize)
+
+	}
+
+	var d fp.Element
+	var rr G2Affine
+
+	for j := 0; j < batchSize; j++ {
+		// compute lambda, distinguishing dbl / add
+		if isDbl[j] {
+			d.Square(&P[j].X)
+			lambda[j].Mul(&lambda[j], &d)
+			d.Double(&lambda[j])
+			lambda[j].Add(&lambda[j], &d)
+		} else {
+			d.Sub(&P[j].Y, &R[j].Y)
+			lambda[j].Mul(&lambda[j], &d)
+		}
+
+		// compute X, Y
+		rr.X.Square(&lambda[j])
+		rr.X.Sub(&rr.X, &R[j].X)
+		rr.X.Sub(&rr.X, &P[j].X)
+		d.Sub(&R[j].X, &rr.X)
+		rr.Y.Mul(&lambda[j], &d)
+		rr.Y.Sub(&rr.Y, &R[j].Y)
+		R[j].Set(&rr)
+	}
+}
+
+// batch inversion
+// similar to the fp.Element batch inversion; ignores edge cases (zero inputs)
+func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) {
+
+	var accumulator fp.Element
+	accumulator.SetOne()
+
+	for i := 0; i < n; i++ {
+		res[i] = accumulator
+		accumulator.Mul(&accumulator, &a[i])
+	}
+
+	accumulator.Inverse(&accumulator)
+
+	for i := n - 1; i >= 0; i-- {
+		res[i].Mul(&res[i], &accumulator)
+		accumulator.Mul(&accumulator, &a[i])
+	}
+}
diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go
new file mode 100644
index 0000000000..427f4d3891
--- /dev/null
+++ b/ecc/bw6-756/multiexp_affine.go
@@ -0,0 +1,857 @@
+// Copyright 2020 ConsenSys Software Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package bw6756
+
+import (
+	"errors"
+	"github.com/consensys/gnark-crypto/ecc"
+	"github.com/consensys/gnark-crypto/ecc/bw6-756/fr"
+	"math"
+	"runtime"
+)
+
+const MAX_BATCH_SIZE = 600
+
+type batchOp struct {
+	bucketID, pointID uint32
+}
+
+func (o batchOp) isNeg() bool {
+	return o.pointID&1 == 1
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) {
+	var _p G1Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
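+//
+// A usage sketch with an explicit parallelism target (hypothetical inputs; NbTasks
+// defaults to runtime.NumCPU() when left at zero):
+//
+//	var res G1Jac
+//	if _, err := res.MultiExpBatchAffine(points, scalars, ecc.MultiExpConfig{NbTasks: 4}); err != nil {
+//		// handle the error
+//	}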
+func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	//   --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunk digits
+	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks,
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 8, 16}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for a G2 MultiExp with > 8M points, hand-picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
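+		// worked example of the crossover (assuming fr.Limbs*64 = 384 bits; hypothetical sizes):
+		// at nbPoints = 2^14, c = 8 costs (384/8)*(2^14+2^8) ≈ 8.0e5 and beats c = 16 (≈ 2.0e6);
+		// at nbPoints = 2^20 the bucket term 2^c is amortized and c = 16 (≈ 2.7e7) beats c = 8 (≈ 5.0e7).
+		// the commented-out cap below is one such hand tuning: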
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. + _p := make([]G1Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { + var _p g1JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG1Affine struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G1Affine +} + +func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG1Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG1Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG1Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG1Affine) CanAdd(bID 
uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG1Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { + B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG1AffineBatchAffine(chunk uint64, + chRes chan<- g1JacExtended, + buckets []G1Affine, + c uint64, + points []G1Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG1Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG1Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1]
+
+	var runningSum, total g1JacExtended
+	runningSum.setInfinity()
+	total.setInfinity()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].IsInfinity() {
+			runningSum.addMixed(&buckets[k])
+		}
+		total.add(&runningSum)
+	}
+
+	chRes <- total
+
+}
+
+func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
+	const (
+		c        = 16                  // scalars partitioned into c-bit radixes
+		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	)
+
+	// for each chunk, spawn one goroutine that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each goroutine sends its result on the chChunks[i] channel
+	var chChunks [nbChunks]chan g1JacExtended
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan g1JacExtended, 1)
+	}
+
+	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
+		var buckets [1 << (c - 1)]G1Affine
+		msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan g1JacExtended, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.add(&s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:])
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	//   --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunk digits
+	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+ // negative digits will be processed in the next step as adding -G into the bucket instead of G + // (computing -G is cheap, and this saves us half of the buckets) + // step 2 + // buckets are declared on the stack + // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) + // we use jacobian extended formulas here as they are faster than mixed addition + // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel + // step 3 + // reduce the buckets weigthed sums into our result (msmReduceChunk) + + // ensure len(points) == len(scalars) + nbPoints := len(points) + if nbPoints != len(scalars) { + return nil, errors.New("len(points) != len(scalars)") + } + + // if nbTasks is not set, use all available CPUs + if config.NbTasks <= 0 { + config.NbTasks = runtime.NumCPU() + } else if config.NbTasks > 1024 { + return nil, errors.New("invalid config: config.NbTasks > 1024") + } + + // here, we compute the best C for nbPoints + // we split recursively until nbChunks(c) >= nbTasks, + bestC := func(nbPoints int) uint64 { + // implemented batchAffineMsmC methods (the c we use must be in this slice) + implementedCs := []uint64{4, 5, 8, 16} + var C uint64 + // approximate cost (in group operations) + // cost = bits/c * (nbPoints + 2^{c}) + // this needs to be verified empirically. + // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results + min := math.MaxFloat64 + for _, c := range implementedCs { + cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cost := float64(cc) / float64(c) + if cost < min { + min = cost + C = c + } + } + // empirical, needs to be tuned. + // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. 
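+	// (worked example of the loop above, assuming 384-bit scalars (fr.Limbs = 6) and
+	// config.NbTasks = 32: a first pass picking C = 16 gives nbChunks = 384/16 = 24 < 32,
+	// so nbSplits doubles to 2 and nbPoints halves; if bestC picks 16 again, nbChunks
+	// becomes 2*24 = 48 >= 32 and the loop exits with 2 splits over half the points each)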
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { + var _p g2JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG2Affine struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G2Affine +} + +func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG2Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG2Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG2Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG2Affine) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG2Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { + B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = 
queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG2AffineBatchAffine(chunk uint64, + chRes chan<- g2JacExtended, + buckets []G2Affine, + c uint64, + points []G2Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG2Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG2Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index 6fca10f2eb..d101a5c9a6 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -187,6 +187,39 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + var result1, result2 G1Jac + for _, c := range cRange { + scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) + msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) + msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + if !result1.Equal(&result2) { + return false + } + } + return true + }, + genScalar, + )) + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -245,12 +278,19 @@ func BenchmarkMultiExpG1(b *testing.B) { for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -478,6 +518,39 @@ func TestMultiExpG2(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + var result1, result2 G2Jac + for _, c := range cRange { + scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) + msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false) + msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + if !result1.Equal(&result2) { + return false + } + } + return true + }, + genScalar, + )) + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -536,12 +609,19 @@ func BenchmarkMultiExpG2(b *testing.B) { for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bw6-761/g1.go b/ecc/bw6-761/g1.go index e940a54575..765d29433b 100644 --- a/ecc/bw6-761/g1.go +++ b/ecc/bw6-761/g1.go @@ -1094,3 +1094,77 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin toReturnAff := BatchJacobianToAffineG1(toReturn) return toReturnAff } + +// batch add/dbl in affine coordinates +// using batch inversion +// cost add: 5*batchSize M + 1I, dbl: +1M +func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { + if batchSize == 0 { + return + } + var isDbl [MAX_BATCH_SIZE]bool + var lambda [MAX_BATCH_SIZE]fp.Element + + { + var lambdain [MAX_BATCH_SIZE]fp.Element + + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + } + + // invert denominator + BatchInvertG1Affine(&lambda, &lambdain, batchSize) + + } + + var d fp.Element + var rr G1Affine + + for j := 0; j < batchSize; j++ { + // computa lambda, distinguishing dbl / add + if isDbl[j] { + d.Square(&P[j].X) + lambda[j].Mul(&lambda[j], &d) + d.Double(&lambda[j]) + lambda[j].Add(&lambda[j], &d) + } else { + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) + } + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[j].X) + rr.X.Sub(&rr.X, &P[j].X) + d.Sub(&R[j].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[j].Y) + R[j].Set(&rr) + } +} + +// batch inversion +// similar to BatchInvertfp.Element, ignores edge cases +func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { + + var accumulator fp.Element + accumulator.SetOne() + + for i := 0; i < n; i++ { + res[i] = accumulator + accumulator.Mul(&accumulator, &a[i]) + } + + accumulator.Inverse(&accumulator) + + for i := n - 1; i >= 0; i-- { + res[i].Mul(&res[i], &accumulator) + accumulator.Mul(&accumulator, &a[i]) + } +} diff --git a/ecc/bw6-761/g2.go b/ecc/bw6-761/g2.go index cef585280a..fdb98731d4 100644 --- a/ecc/bw6-761/g2.go +++ b/ecc/bw6-761/g2.go @@ -954,3 +954,77 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin }) return toReturn } + +// batch add/dbl in affine coordinates +// using batch inversion +// cost add: 5*batchSize M + 1I, dbl: +1M +func BatchAddG2Affine(R 
[]*G2Affine, P []G2Affine, batchSize int) { + if batchSize == 0 { + return + } + var isDbl [MAX_BATCH_SIZE]bool + var lambda [MAX_BATCH_SIZE]fp.Element + + { + var lambdain [MAX_BATCH_SIZE]fp.Element + + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + } + + // invert denominator + BatchInvertG2Affine(&lambda, &lambdain, batchSize) + + } + + var d fp.Element + var rr G2Affine + + for j := 0; j < batchSize; j++ { + // computa lambda, distinguishing dbl / add + if isDbl[j] { + d.Square(&P[j].X) + lambda[j].Mul(&lambda[j], &d) + d.Double(&lambda[j]) + lambda[j].Add(&lambda[j], &d) + } else { + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) + } + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[j].X) + rr.X.Sub(&rr.X, &P[j].X) + d.Sub(&R[j].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[j].Y) + R[j].Set(&rr) + } +} + +// batch inversion +// similar to BatchInvertfp.Element, ignores edge cases +func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { + + var accumulator fp.Element + accumulator.SetOne() + + for i := 0; i < n; i++ { + res[i] = accumulator + accumulator.Mul(&accumulator, &a[i]) + } + + accumulator.Inverse(&accumulator) + + for i := n - 1; i >= 0; i-- { + res[i].Mul(&res[i], &accumulator) + accumulator.Mul(&accumulator, &a[i]) + } +} diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go new file mode 100644 index 0000000000..09004ae309 --- /dev/null +++ b/ecc/bw6-761/multiexp_affine.go @@ -0,0 +1,857 @@ +// Copyright 2020 ConsenSys Software Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package bw6761 + +import ( + "errors" + "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" + "math" + "runtime" +) + +const MAX_BATCH_SIZE = 600 + +type batchOp struct { + bucketID, pointID uint32 +} + +func (o batchOp) isNeg() bool { + return o.pointID&1 == 1 +} + +// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// +// This call return an error if len(scalars) != len(points) or if provided config is invalid. +func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) { + var _p G1Jac + if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { + return nil, err + } + p.FromJacobian(&_p) + return p, nil +} + +// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// +// This call return an error if len(scalars) != len(points) or if provided config is invalid. 
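+//
+// A minimal usage sketch (hypothetical caller; points and scalars are assumed to be
+// pre-filled with len(points) == len(scalars)):
+//
+//	var acc G1Jac
+//	if _, err := acc.MultiExpBatchAffine(points, scalars, ecc.MultiExpConfig{}); err != nil {
+//		// handle invalid input / config
+//	}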
+func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	//	--> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar, nbChunks digits over c-bit wide windows
+	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks,
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 8, 16}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. + _p := make([]G1Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { + var _p g1JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG1Affine struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G1Affine +} + +func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG1Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG1Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG1Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG1Affine) CanAdd(bID 
uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG1Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { + B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG1AffineBatchAffine(chunk uint64, + chRes chan<- g1JacExtended, + buckets []G1Affine, + c uint64, + points []G1Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG1Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG1Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] + + var runningSum, total g1JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// +// This call return an error if len(scalars) != len(points) or if provided config is invalid. +func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { + var _p G2Jac + if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { + return nil, err + } + p.FromJacobian(&_p) + return p, nil +} + +// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// +// This call return an error if len(scalars) != len(points) or if provided config is invalid. +func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { + // note: + // each of the batchAffineMsmCX method is the same, except for the c constant it declares + // duplicating (through template generation) these methods allows to declare the buckets on the stack + // the choice of c needs to be improved: + // there is a theoritical value that gives optimal asymptotics + // but in practice, other factors come into play, including: + // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 + // * number of CPUs + // * cache friendliness (which depends on the host, G1 or G2... ) + // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. + + // for each batchAffineMsmCX + // step 1 + // we compute, for each scalars over c-bit wide windows, nbChunk digits + // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract + // 2^{c} to the current digit, making it negative. 
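+	// (encoding note, restating what the code below does: a batchOp packs the point index
+	// and the sign into a single uint32, pointID = i<<1 for +points[i] and pointID = i<<1 | 1
+	// for -points[i], so isNeg() only has to test the low bit)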
+ // negative digits will be processed in the next step as adding -G into the bucket instead of G + // (computing -G is cheap, and this saves us half of the buckets) + // step 2 + // buckets are declared on the stack + // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) + // we use jacobian extended formulas here as they are faster than mixed addition + // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel + // step 3 + // reduce the buckets weigthed sums into our result (msmReduceChunk) + + // ensure len(points) == len(scalars) + nbPoints := len(points) + if nbPoints != len(scalars) { + return nil, errors.New("len(points) != len(scalars)") + } + + // if nbTasks is not set, use all available CPUs + if config.NbTasks <= 0 { + config.NbTasks = runtime.NumCPU() + } else if config.NbTasks > 1024 { + return nil, errors.New("invalid config: config.NbTasks > 1024") + } + + // here, we compute the best C for nbPoints + // we split recursively until nbChunks(c) >= nbTasks, + bestC := func(nbPoints int) uint64 { + // implemented batchAffineMsmC methods (the c we use must be in this slice) + implementedCs := []uint64{4, 5, 8, 16} + var C uint64 + // approximate cost (in group operations) + // cost = bits/c * (nbPoints + 2^{c}) + // this needs to be verified empirically. + // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results + min := math.MaxFloat64 + for _, c := range implementedCs { + cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cost := float64(cc) / float64(c) + if cost < min { + min = cost + C = c + } + } + // empirical, needs to be tuned. + // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. 
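+	// (sketch of the fan-out/fan-in below: nbSplits-1 goroutines each compute a partial MSM
+	// over their slice into _p[i] and signal completion on chDone; the last slice is computed
+	// on the current goroutine directly into p, and partial results are folded in with
+	// AddAssign as they arrive)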
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { + var _p g2JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG2Affine struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G2Affine +} + +func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG2Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG2Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG2Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG2Affine) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG2Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { + B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = 
queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG2AffineBatchAffine(chunk uint64, + chRes chan<- g2JacExtended, + buckets []G2Affine, + c uint64, + points []G2Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG2Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG2Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index 07de759142..d5b1288c1e 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -187,6 +187,39 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + var result1, result2 G1Jac + for _, c := range cRange { + scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) + msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) + msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + if !result1.Equal(&result2) { + return false + } + } + return true + }, + genScalar, + )) + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -245,12 +278,19 @@ func BenchmarkMultiExpG1(b *testing.B) { for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -478,6 +518,39 @@ func TestMultiExpG2(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + var result1, result2 G2Jac + for _, c := range cRange { + scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) + msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false) + msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + if !result1.Equal(&result2) { + return false + } + } + return true + }, + genScalar, + )) + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -536,12 +609,19 @@ func BenchmarkMultiExpG2(b *testing.B) { for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/internal/generator/ecc/generate.go b/internal/generator/ecc/generate.go index 594e1343cd..bc367c5c14 100644 --- a/internal/generator/ecc/generate.go +++ b/internal/generator/ecc/generate.go @@ -15,6 +15,7 @@ func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) er entries := []bavard.Entry{ {File: filepath.Join(baseDir, "multiexp.go"), Templates: []string{"multiexp.go.tmpl"}}, + {File: filepath.Join(baseDir, "multiexp_affine.go"), Templates: []string{"multiexp_affine.go.tmpl"}}, {File: filepath.Join(baseDir, "multiexp_test.go"), Templates: []string{"tests/multiexp.go.tmpl"}}, {File: filepath.Join(baseDir, "marshal.go"), Templates: []string{"marshal.go.tmpl"}}, {File: filepath.Join(baseDir, "marshal_test.go"), Templates: []string{"tests/marshal.go.tmpl"}}, diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl new file mode 100644 index 0000000000..02d8c72588 --- /dev/null +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -0,0 +1,474 @@ +{{ $G1TAffine := print (toUpper .G1.PointName) "Affine" }} +{{ $G1TJacobian := print (toUpper .G1.PointName) "Jac" }} +{{ $G1TJacobianExtended := print (toLower .G1.PointName) "JacExtended" }} + +{{ $G2TAffine := print (toUpper .G2.PointName) "Affine" }} +{{ $G2TJacobian := print (toUpper .G2.PointName) "Jac" }} +{{ $G2TJacobianExtended := print (toLower .G2.PointName) "JacExtended" }} + +import ( + "github.com/consensys/gnark-crypto/ecc/{{.Name}}/fr" + "github.com/consensys/gnark-crypto/ecc" + "errors" + "math" + "runtime" +) + +const MAX_BATCH_SIZE = 600 + +type batchOp struct { + bucketID, pointID uint32 +} + +func (o batchOp) isNeg() bool { + return o.pointID&1 == 1 +} + + + +{{ template "multiexp" dict "PointName" .G1.PointName "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange}} +{{ template "multiexp" dict "PointName" .G2.PointName "TAffine" $G2TAffine "TJacobian" $G2TJacobian "TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G2.CRange}} + + +{{define "multiexp" }} + + +// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// +// This call return an error if len(scalars) != 
len(points) or if provided config is invalid.
+func (p *{{ $.TAffine }}) MultiExpBatchAffine(points []{{ $.TAffine }}, scalars []fr.Element, config ecc.MultiExpConfig) (*{{ $.TAffine }}, error) {
+	var _p {{$.TJacobian}}
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
+func (p *{{ $.TJacobian }}) MultiExpBatchAffine(points []{{ $.TAffine }}, scalars []fr.Element, config ecc.MultiExpConfig) (*{{ $.TJacobian }}, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	//	--> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar, nbChunks digits over c-bit wide windows
+	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks,
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{
+			{{- range $c := $.CRange}} {{- if and (eq $.PointName "g1") (gt $c 21)}}{{- else}} {{$c}},{{- end}}{{- end}}
+		}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs * 64) % C != 0 { + nbChunks ++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInner{{ $.TJacobian }}BatchAffine , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. + _p := make([]{{ $.TJacobian }}, nbSplits - 1) + chDone := make(chan int, nbSplits - 1) + for i:=0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInner{{ $.TJacobian }}BatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInner{{ $.TJacobian }}BatchAffine(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + for i:=0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInner{{ $.TJacobian }}BatchAffine(p *{{ $.TJacobian }}, c int, points []{{ $.TAffine }}, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + {{range $c := $.CRange}} + case {{$c}}: + {{- if le $c 9}} + p.msmC{{$c}}(points, scalars, splitFirstChunk) + {{- else}} + p.batchAffineMsmC{{$c}}(points, scalars, splitFirstChunk) + {{- end}} + {{end}} + default: + panic("not implemented") + } +} + +// msmReduceChunk{{ $.TAffine }}BatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunk{{ $.TAffine }}BatchAffine(p *{{ $.TJacobian }}, c int, chChunks []chan {{ $.TJacobianExtended }}) *{{ $.TJacobian }} { + var _p {{ $.TJacobianExtended }} + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + + +type Batch{{ $.TAffine }} struct { + P [MAX_BATCH_SIZE]{{ $.TAffine }} + R [MAX_BATCH_SIZE]*{{ $.TAffine }} + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []{{ $.TAffine }} +} + +func newBatch{{ $.TAffine }}(buckets, points []{{ $.TAffine }}) Batch{{ $.TAffine }} { + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return Batch{{ $.TAffine }}{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *Batch{{ $.TAffine }}) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *Batch{{ $.TAffine }}) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAdd{{ $.TAffine 
}}(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *Batch{{ $.TAffine }}) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *Batch{{ $.TAffine }}) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { + B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueue{{ $.TAffine }}(queue []batchOp, batch *Batch{{ $.TAffine }}) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunk{{ $.TAffine }}BatchAffine(chunk uint64, + chRes chan<- {{ $.TJacobianExtended }}, + buckets []{{ $.TAffine }}, + c uint64, + points []{{ $.TAffine }}, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64 %c)!=0 && s.shift > (64-c) && s.index < (fr.Limbs - 1 ) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatch{{ $.TAffine }}(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueue{{ $.TAffine }}(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. 
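+		// (termination note: ExecuteAndReset leaves the batch with no pending buckets, so the
+		// next processQueue pass can always consume at least one queued op; the queue therefore
+		// keeps shrinking and this loop terminates)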
+ }
+
+ // flush items in batch.
+ batch.ExecuteAndReset()
+
+ // reduce buckets into total
+ // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
+
+ var runningSum, total {{ $.TJacobianExtended }}
+ runningSum.setInfinity()
+ total.setInfinity()
+ for k := len(buckets) - 1; k >= 0; k-- {
+ if !buckets[k].IsInfinity() {
+ runningSum.addMixed(&buckets[k])
+ }
+ total.add(&runningSum)
+ }
+
+ chRes <- total
+
+}
+
+
+{{range $c := $.CRange}}
+{{- if gt $c 9}}
+{{- $frBits := mul $.FrNbWords 64}}
+{{- $cDividesBits := divides $c $frBits}}
+{{- $nbChunks := div $frBits $c}}
+
+func (p *{{ $.TJacobian }}) batchAffineMsmC{{$c}}(points []{{ $.TAffine }}, scalars []fr.Element, splitFirstChunk bool) *{{ $.TJacobian }} {
+ const (
+ c = {{$c}} // scalars partitioned into c-bit radixes
+ nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+ )
+
+ // for each chunk, spawn one go routine that'll loop through all the scalars in the
+ // corresponding bit-window
+ // note that buckets is an array allocated on the stack (for most sizes of c) and this is
+ // critical for performance
+
+ // each go routine sends its result in chChunks[i] channel
+ var chChunks [nbChunks{{if not $cDividesBits }} + 1 {{end}} ]chan {{ $.TJacobianExtended }}
+ for i := 0; i < len(chChunks); i++ {
+ chChunks[i] = make(chan {{ $.TJacobianExtended }}, 1)
+ }
+
+
+ {{ if not $cDividesBits }}
+
+ // c doesn't divide {{$frBits}}, the last window is smaller, so we can allocate fewer buckets
+ const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
+ // TODO @gbotrel replace this in code generator
+ if lastC >= 10 {
+ go func(j uint64, points []{{ $.TAffine }}, scalars []fr.Element) {
+ var buckets [1<<(lastC-1)]{{ $.TAffine }}
+ msmProcessChunk{{ $.TAffine }}BatchAffine(j, chChunks[j], buckets[:], c, points, scalars)
+ }(uint64(nbChunks), points, scalars)
+ } else {
+ go func(j uint64, points []{{ $.TAffine }}, scalars []fr.Element) {
+ var buckets [1<<(lastC-1)]{{ $.TJacobianExtended }}
+ msmProcessChunk{{ $.TAffine }}(j, chChunks[j], buckets[:], c, points, scalars)
+ }(uint64(nbChunks), points, scalars)
+ }
+ {{- end}}
+
+ processChunk := func(j int, points []{{ $.TAffine }}, scalars []fr.Element, chChunk chan {{ $.TJacobianExtended }}) {
+ var buckets [1<<(c-1)]{{ $.TAffine }}
+ msmProcessChunk{{ $.TAffine }}BatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+ }
+
+ for j := int(nbChunks - 1); j > 0; j-- {
+ go processChunk(j, points, scalars, chChunks[j])
+ }
+
+ if !splitFirstChunk {
+ go processChunk(0, points, scalars, chChunks[0])
+ } else {
+ chSplit := make(chan {{ $.TJacobianExtended }}, 2)
+ split := len(points) / 2
+ go processChunk(0, points[:split], scalars[:split], chSplit)
+ go processChunk(0, points[split:], scalars[split:], chSplit)
+ go func() {
+ s1 := <-chSplit
+ s2 := <-chSplit
+ close(chSplit)
+ s1.add(&s2)
+ chChunks[0] <- s1
+ }()
+ }
+
+ return msmReduceChunk{{ $.TAffine }}BatchAffine(p, c, chChunks[:])
+}
+{{- end}}
+{{end}}
+
+
+{{end }}
diff --git a/internal/generator/ecc/template/point.go.tmpl b/internal/generator/ecc/template/point.go.tmpl
index d7e8ea021d..bbc5ec8980 100644
--- a/internal/generator/ecc/template/point.go.tmpl
+++ b/internal/generator/ecc/template/point.go.tmpl
@@ -1568,3 +1568,82 @@ func BatchScalarMultiplication{{ toUpper .PointName }}(base *{{ $TAffine }}, sca
 return toReturn
 {{- end}}
 }
+
+
+
+// batch add/dbl in affine coordinates
+// using batch inversion
+// cost add: 5*batchSize M + 1I, dbl: +1M
+func BatchAdd{{ $TAffine }}(R []*{{ $TAffine 
}}, P []{{ $TAffine }}, batchSize int) {
+ if batchSize == 0 {
+ return
+ }
+ var isDbl [MAX_BATCH_SIZE]bool
+ var lambda [MAX_BATCH_SIZE]{{.CoordType}}
+
+ {
+ var lambdain [MAX_BATCH_SIZE]{{.CoordType}}
+
+
+ for j := 0; j < batchSize; j++ {
+ // detect dbl vs add & compute denominator
+ if P[j].Equal(R[j]) {
+ isDbl[j] = true
+ lambdain[j].Double(&P[j].Y)
+ } else {
+ lambdain[j].Sub(&P[j].X, &R[j].X)
+ }
+ }
+
+ // invert denominator
+ BatchInvert{{ $TAffine }}(&lambda, &lambdain, batchSize)
+
+ }
+
+ var d {{.CoordType}}
+ var rr {{ $TAffine }}
+
+ for j := 0; j < batchSize; j++ {
+ // compute lambda, distinguishing dbl / add
+ if isDbl[j] {
+ d.Square(&P[j].X)
+ lambda[j].Mul(&lambda[j], &d)
+ d.Double(&lambda[j])
+ lambda[j].Add(&lambda[j], &d)
+ } else {
+ d.Sub(&P[j].Y, &R[j].Y)
+ lambda[j].Mul(&lambda[j], &d)
+ }
+
+ // compute X, Y
+ rr.X.Square(&lambda[j])
+ rr.X.Sub(&rr.X, &R[j].X)
+ rr.X.Sub(&rr.X, &P[j].X)
+ d.Sub(&R[j].X, &rr.X)
+ rr.Y.Mul(&lambda[j], &d)
+ rr.Y.Sub(&rr.Y, &R[j].Y)
+ R[j].Set(&rr)
+ }
+}
+
+
+
+// batch inversion
+// similar to BatchInvert{{.CoordType}}, ignores edge cases
+func BatchInvert{{ $TAffine }}(res, a *[MAX_BATCH_SIZE]{{.CoordType}}, n int) {
+
+ var accumulator {{.CoordType}}
+ accumulator.SetOne()
+
+ for i := 0; i < n; i++ {
+ res[i] = accumulator
+ accumulator.Mul(&accumulator, &a[i])
+ }
+
+ accumulator.Inverse(&accumulator)
+
+ for i := n - 1; i >= 0; i-- {
+ res[i].Mul(&res[i], &accumulator)
+ accumulator.Mul(&accumulator, &a[i])
+ }
+}
\ No newline at end of file
diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl
index fdf2a82d8e..cde8bd0b2a 100644
--- a/internal/generator/ecc/template/tests/multiexp.go.tmpl
+++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl
@@ -196,6 +196,40 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) {
 ))
 
 
+ properties.Property(fmt.Sprintf("[{{ toUpper $.PointName }}] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll(
+ func(mixer fr.Element) bool {
+ // multi exp points
+ var samplePoints [nbSamples]{{ $.TAffine }}
+ var g {{ $.TJacobian }}
+ g.Set(&{{ toLower .PointName}}Gen)
+ for i := 1; i <= nbSamples; i++ {
+ samplePoints[i-1].FromJacobian(&g)
+ g.AddAssign(&{{ toLower .PointName}}Gen)
+ }
+ // mixer ensures that all the words of a fpElement are set
+ var sampleScalars [nbSamples]fr.Element
+
+ for i := 1; i <= nbSamples; i++ {
+ sampleScalars[i-1].SetUint64(uint64(i)).
+ Mul(&sampleScalars[i-1], &mixer).
+ FromMont() + } + + var result1, result2 {{ $.TJacobian }} + for _, c := range cRange { + scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) + msmInner{{ $.TJacobian }}(&result1, int(c), samplePoints[:], scalars, false) + msmInner{{ $.TJacobian }}BatchAffine(&result2, int(c), samplePoints[:], scalars, false) + if !result1.Equal(&result2) { + return false + } + } + return true + }, + genScalar, + )) + + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[{{ toUpper $.PointName }}] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -256,12 +290,19 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using],ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using],ecc.MultiExpConfig{}) + } + }) } } From 853bfb1f8edaa3cd4b54a3975b390694b8f98858 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Mon, 7 Nov 2022 16:13:00 -0600 Subject: [PATCH 02/43] build: updated go.mod to go 1.18 req --- go.mod | 2 +- go.sum | 9 --------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/go.mod b/go.mod index 583ea31428..f1fd1fb56c 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/consensys/gnark-crypto -go 1.17 +go 1.18 require ( github.com/consensys/bavard v0.1.13 diff --git a/go.sum b/go.sum index 24019a2212..a0175604ce 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,3 @@ -github.com/consensys/bavard v0.1.12 h1:rApQlUvBg5FeW/fnigtVnAs0sBrgDN2pEuHNdWElSUE= -github.com/consensys/bavard v0.1.12/go.mod h1:9ItSMtA/dXMAiL7BG6bqW2m3NdSEObYWoH223nGHukI= github.com/consensys/bavard v0.1.13 h1:oLhMLOFGTLdlda/kma4VOJazblc7IM5y5QPd2A/YjhQ= github.com/consensys/bavard v0.1.13/go.mod h1:9ItSMtA/dXMAiL7BG6bqW2m3NdSEObYWoH223nGHukI= github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= @@ -28,15 +26,8 @@ github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PK github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= golang.org/x/crypto v0.0.0-20220722155217-630584e8d5aa h1:zuSxTR4o9y82ebqCUJYNGJbGPo6sKVl54f/TVDObg1c= golang.org/x/crypto v0.0.0-20220722155217-630584e8d5aa/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= -golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= -golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220727055044-e65921a090b8 h1:dyU22nBWzrmTQxtNrr4dzVOvaw35nUYE279vF9UmsI8= golang.org/x/sys v0.0.0-20220727055044-e65921a090b8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/text v0.3.6/go.mod 
h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= From 9d170efdb5942e65487a7c706f6964411adcdcb3 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Mon, 7 Nov 2022 22:30:43 -0600 Subject: [PATCH 03/43] feat: started to factorize msm impl through generics --- ecc/bls12-377/multiexp.go | 1964 +++------------- ecc/bls12-377/multiexp_affine.go | 1986 ++++++----------- ecc/bls12-377/multiexp_test.go | 10 +- ecc/bls12-378/multiexp.go | 1964 +++------------- ecc/bls12-378/multiexp_affine.go | 1986 ++++++----------- ecc/bls12-378/multiexp_test.go | 10 +- ecc/bls12-381/multiexp.go | 1964 +++------------- ecc/bls12-381/multiexp_affine.go | 1986 ++++++----------- ecc/bls12-381/multiexp_test.go | 10 +- ecc/bls24-315/multiexp.go | 1964 +++------------- ecc/bls24-315/multiexp_affine.go | 1986 ++++++----------- ecc/bls24-315/multiexp_test.go | 10 +- ecc/bls24-317/multiexp.go | 1964 +++------------- ecc/bls24-317/multiexp_affine.go | 1986 ++++++----------- ecc/bls24-317/multiexp_test.go | 10 +- ecc/bn254/multiexp.go | 1964 +++------------- ecc/bn254/multiexp_affine.go | 1986 ++++++----------- ecc/bn254/multiexp_test.go | 10 +- ecc/bw6-633/multiexp.go | 460 ++-- ecc/bw6-633/multiexp_affine.go | 540 ++++- ecc/bw6-633/multiexp_test.go | 10 +- ecc/bw6-756/multiexp.go | 474 ++-- ecc/bw6-756/multiexp_affine.go | 540 ++++- ecc/bw6-756/multiexp_test.go | 10 +- ecc/bw6-761/multiexp.go | 474 ++-- ecc/bw6-761/multiexp_affine.go | 540 ++++- ecc/bw6-761/multiexp_test.go | 10 +- internal/generator/ecc/generate.go | 32 +- .../generator/ecc/template/multiexp.go.tmpl | 49 +- .../ecc/template/multiexp_affine.go.tmpl | 143 +- .../ecc/template/tests/multiexp.go.tmpl | 4 +- 31 files changed, 7524 insertions(+), 19522 deletions(-) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index 6d4f14f13b..0f487a104c 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -221,7 +221,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -294,50 +294,74 @@ func msmInnerG1Jac(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, spl switch c { + case 1: + msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, 
scalars, splitFirstChunk) case 6: - p.msmC6(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) case 7: - p.msmC7(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) case 9: - p.msmC9(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) case 10: - p.msmC10(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) case 11: - p.msmC11(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) case 12: - p.msmC12(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) case 13: - p.msmC13(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) case 14: - p.msmC14(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) case 15: - p.msmC15(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) case 16: - p.msmC16(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) + + case 18: + msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + + case 19: + msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) case 20: - p.msmC20(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) case 21: - p.msmC21(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) + + case 22: + msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) + + case 23: + msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") @@ -360,9 +384,8 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J return p.unsafeFromJacExtended(&_p) } -func msmProcessChunkG1Affine(chunk uint64, +func msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, - buckets []g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element) { @@ -370,6 +393,7 @@ func msmProcessChunkG1Affine(chunk uint64, mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -424,26 +448,36 @@ func msmProcessChunkG1Affine(chunk uint64, } -func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 4 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - +func 
msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
+ nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+ if (fr.Limbs*64)%c != 0 {
+ nbChunks++
+ }
 // for each chunk, spawn one go routine that'll loop through all the scalars in the
 // corresponding bit-window
 // note that buckets is an array allocated on the stack (for most sizes of c) and this is
 // critical for performance

 // each go routine sends its result in chChunks[i] channel
- var chChunks [nbChunks]chan g1JacExtended
+ chChunks := make([]chan g1JacExtended, nbChunks)
 for i := 0; i < len(chChunks); i++ {
 chChunks[i] = make(chan g1JacExtended, 1)
 }

+ if (fr.Limbs*64)%c != 0 {
+ // TODO @gbotrel not always needed to do ext jac here.
+ go func(j uint64, points []G1Affine, scalars []fr.Element) {
+ // var buckets LB
+ // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
+ // buckets := make([]g1JacExtended, 1<<(lastC-1))
+ // TODO @gbotrel last C restore.
+ msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars)
+ }(uint64(nbChunks-1), points, scalars)
+ nbChunks--
+ }
+
 processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
- var buckets [1 << (c - 1)]g1JacExtended
- msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars)
+ msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars)
 }

 for j := int(nbChunks - 1); j > 0; j-- {
@@ -466,1719 +500,325 @@ func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, splitFirstChunk b
 }()
 }

- return msmReduceChunkG1Affine(p, c, chChunks[:])
+ return msmReduceChunkG1Affine(p, int(c), chChunks[:])
 }

-func (p *G1Jac) msmC5(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
- const (
- c = 5 // scalars partitioned into c-bit radixes
- nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
- )
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
+func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+ var _p G2Jac
+ if _, err := _p.MultiExp(points, scalars, config); err != nil {
+ return nil, err
+ }
+ p.FromJacobian(&_p)
+ return p, nil
+}

- // for each chunk, spawn one go routine that'll loop through all the scalars in the
- // corresponding bit-window
- // note that buckets is an array allocated on the stack (for most sizes of c) and this is
- // critical for performance
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
+func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+ // note:
+ // each of the msmCX methods is the same, except for the c constant it declares
+ // duplicating (through template generation) these methods allows us to declare the buckets on the stack
+ // the choice of c needs to be improved:
+ // there is a theoretical value that gives optimal asymptotics
+ // but in practice, other factors come into play, including:
+ // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+ // * number of CPUs
+ // * cache friendliness (which depends on the host, G1 or G2... )
+ // --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.

+ // for each msmCX
+ // step 1
+ // we compute, for each scalar over c-bit wide windows, nbChunk digits
+ // if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+ // 2^{c} from the current digit, making it negative.
+ // negative digits will be processed in the next step as adding -G into the bucket instead of G
+ // (computing -G is cheap, and this saves us half of the buckets)
+ // step 2
+ // buckets are declared on the stack
+ // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1)
+ // we use jacobian extended formulas here as they are faster than mixed addition
+ // msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+ // step 3
+ // reduce the buckets weighted sums into our result (msmReduceChunk)

+ // ensure len(points) == len(scalars)
+ nbPoints := len(points)
+ if nbPoints != len(scalars) {
+ return nil, errors.New("len(points) != len(scalars)")
+ }

+ // if nbTasks is not set, use all available CPUs
+ if config.NbTasks <= 0 {
+ config.NbTasks = runtime.NumCPU()
+ } else if config.NbTasks > 1024 {
+ return nil, errors.New("invalid config: config.NbTasks > 1024")
+ }

+ // here, we compute the best C for nbPoints
+ // we split recursively until nbChunks(c) >= nbTasks,
+ bestC := func(nbPoints int) uint64 {
+ // implemented msmC methods (the c we use must be in this slice)
+ implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
+ var C uint64
+ // approximate cost (in group operations)
+ // cost = bits/c * (nbPoints + 2^{c})
+ // this needs to be verified empirically.
+ // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+ min := math.MaxFloat64
+ for _, c := range implementedCs {
+ cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+ cost := float64(cc) / float64(c)
+ if cost < min {
+ min = cost
+ C = c
+ }
+ }
+ // empirical, needs to be tuned.
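+ // as a rough illustration of the cost model above (numbers are indicative only):
+ // with fr.Limbs = 4 (256-bit scalars) and nbPoints = 2^20,
+ // c = 16 gives 256/16 * (2^20 + 2^16) ~= 17.8M group operations,
+ // while c = 8 gives 256/8 * (2^20 + 2^8) ~= 33.6M.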
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } } - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) -func (p *G1Jac) msmC6(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 6 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // we have nbSplits intermediate results that we must sum together. 
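+ // the first nbSplits-1 sub-MSMs below each run on their own goroutine; the last
+ // one is computed inline on p, and the partial results are accumulated as each
+ // goroutine signals completion on chDone.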
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) + msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) } + close(chDone) + return p, nil +} - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) +func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + switch c { - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC7(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 7 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 4: + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 5: + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 
1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 8: + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 10: + msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} + case 11: + msmCG2Affine[bucketg2JacExtendedC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC8(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 12: + msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 13: + msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 14: + msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 15: + msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 16: + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 17: + msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, 
c, chChunks[:]) -} + case 18: + msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC9(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 9 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 19: + msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split 
:= len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, 
chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, 
chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], 
chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], 
scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExp(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the msmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each msmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. 
- // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. - _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 4: - p.msmC4(points, scalars, splitFirstChunk) - - case 5: - p.msmC5(points, scalars, splitFirstChunk) - - case 6: - p.msmC6(points, scalars, splitFirstChunk) - - case 7: - p.msmC7(points, scalars, splitFirstChunk) - - case 8: - p.msmC8(points, scalars, splitFirstChunk) - - case 9: - p.msmC9(points, scalars, splitFirstChunk) - - case 10: - p.msmC10(points, scalars, splitFirstChunk) - - case 11: - p.msmC11(points, scalars, splitFirstChunk) - - case 12: - p.msmC12(points, scalars, splitFirstChunk) - - case 13: - p.msmC13(points, scalars, splitFirstChunk) - - case 14: - p.msmC14(points, scalars, splitFirstChunk) - - case 15: - p.msmC15(points, scalars, splitFirstChunk) - - case 16: - p.msmC16(points, scalars, splitFirstChunk) - - case 20: - p.msmC20(points, scalars, splitFirstChunk) - - case 21: - p.msmC21(points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -// msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { - var _p g2JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -func msmProcessChunkG2Affine(chunk uint64, - chRes chan<- g2JacExtended, - buckets []g2JacExtended, - c 
uint64, - points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) - } - } - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func (p *G2Jac) msmC4(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 4 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC5(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 5 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 
256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC6(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 6 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC7(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 7 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - 
chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC8(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC9(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 9 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - 
msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, 
scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + case 20: + msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + case 21: + msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 22: + msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 23: + msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + default: + panic("not implemented") } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) } -func (p *G2Jac) msmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through 
all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
-
-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g2JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g2JacExtended, 1)
-	}
-
-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	go func(j uint64, points []G2Affine, scalars []fr.Element) {
-		var buckets [1 << (lastC - 1)]g2JacExtended
-		msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars)
-	}(uint64(nbChunks), points, scalars)
-
-	processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) {
-		var buckets [1 << (c - 1)]g2JacExtended
-		msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars)
-	}
-
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
-	}
-
-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g2JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
+// msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp
+func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac {
+	var _p g2JacExtended
+	totalj := <-chChunks[len(chChunks)-1]
+	_p.Set(&totalj)
+	for j := len(chChunks) - 2; j >= 0; j-- {
+		for l := 0; l < c; l++ {
+			_p.double(&_p)
+		}
+		totalj := <-chChunks[j]
+		_p.add(&totalj)
 	}

-	return msmReduceChunkG2Affine(p, c, chChunks[:])
+	return p.unsafeFromJacExtended(&_p)
 }

-func (p *G2Jac) msmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac {
-	const (
-		c        = 14                  // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	)
-
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
-
-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g2JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g2JacExtended, 1)
-	}
-
-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	go func(j uint64, points []G2Affine, scalars []fr.Element) {
-		var buckets [1 << (lastC - 1)]g2JacExtended
-		msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars)
-	}(uint64(nbChunks), points, scalars)
+func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64,
+	chRes chan<- g2JacExtended,
+	c uint64,
+	points []G2Affine,
+	scalars []fr.Element) {

-	processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) {
-		var buckets [1 << (c - 1)]g2JacExtended
-		msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars)
-	}
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))

-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
+	var buckets B
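+	// note: B is one of the fixed-size bucketg2JacExtendedCX array types, so the
+	// 2^{c-1} buckets are stack-allocated through the type parameter rather than
+	// heap-allocated via a slice; only half of the 2^c window values need a bucket
+	// because negative digits subtract from the same buckets (see below).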
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
 	}

-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g2JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
 	}

-	return msmReduceChunkG2Affine(p, c, chChunks[:])
-}
-
-func (p *G2Jac) msmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac {
-	const (
-		c        = 15                  // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	)
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}

-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+		if bits == 0 {
+			continue
+		}

-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g2JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g2JacExtended, 1)
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].addMixed(&points[i])
+		} else {
+			// sub
+			buckets[bits & ^msbWindow].subMixed(&points[i])
+		}
 	}

-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	go func(j uint64, points []G2Affine, scalars []fr.Element) {
-		var buckets [1 << (lastC - 1)]g2JacExtended
-		msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars)
-	}(uint64(nbChunks), points, scalars)
-
-	processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) {
-		var buckets [1 << (c - 1)]g2JacExtended
-		msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars)
-	}
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
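+	// (running-sum trick: scanning k = n-1 down to 0, runningSum holds
+	// bucket[k] + ... + bucket[n-1]; adding it into total at every step counts
+	// bucket[k] exactly k+1 times, e.g. for n = 3:
+	// (b2) + (b2+b1) + (b2+b1+b0) = 3*b2 + 2*b1 + b0,
+	// i.e. ~2n group additions instead of n small scalar multiplications.)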
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
+	var runningSum, total g2JacExtended
+	runningSum.setInfinity()
+	total.setInfinity()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].ZZ.IsZero() {
+			runningSum.add(&buckets[k])
+		}
+		total.add(&runningSum)
 	}

-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g2JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
+	chRes <- total

-	return msmReduceChunkG2Affine(p, c, chChunks[:])
 }

-func (p *G2Jac) msmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac {
-	const (
-		c        = 16                  // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	)
-
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
-
-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks]chan g2JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g2JacExtended, 1)
-	}
-
-	processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) {
-		var buckets [1 << (c - 1)]g2JacExtended
-		msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars)
-	}
-
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
-	}
-
-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g2JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
+func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac {
+	nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
 	}
-
-	return msmReduceChunkG2Affine(p, c, chChunks[:])
-}
-
-func (p *G2Jac) msmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac {
-	const (
-		c        = 20                  // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	)
-
 	// for each chunk, spawn one go routine that'll loop through all the scalars in the
 	// corresponding bit-window
 	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
 	// critical for performance

 	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g2JacExtended
+	chChunks := make([]chan g2JacExtended, nbChunks)
 	for i := 0; i < len(chChunks); i++ {
 		chChunks[i] = make(chan g2JacExtended, 1)
 	}

-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	go func(j uint64, points []G2Affine, scalars []fr.Element) {
-		var buckets [1 << (lastC - 1)]g2JacExtended
-		msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars)
-	}(uint64(nbChunks),
points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G2Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel last C restore. + msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -2201,5 +841,5 @@ func (p *G2Jac) msmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk }() } - return msmReduceChunkG2Affine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index 876ae12f01..8ff827c422 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -93,7 +93,7 @@ func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, con // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ 
-166,102 +166,111 @@ func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.E switch c { + case 1: + msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) case 6: - p.msmC6(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) case 7: - p.msmC7(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) case 9: - p.msmC9(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) case 10: - p.batchAffineMsmC10(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) case 11: - p.batchAffineMsmC11(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) case 12: - p.batchAffineMsmC12(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) case 13: - p.batchAffineMsmC13(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) case 14: - p.batchAffineMsmC14(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) case 15: - p.batchAffineMsmC15(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) case 16: - p.batchAffineMsmC16(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) + + case 18: + batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + + case 19: + batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) case 20: - p.batchAffineMsmC20(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) case 21: - p.batchAffineMsmC21(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) + + case 22: + batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) + + case 23: + batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") } } -// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of 
the buckets into the result of the multiExp -func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG1Affine struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G1Affine +type BatchG1Affine[B ibG1Affine] struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G1Affine + buckets *B } -func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { - batchSize := len(buckets) / 5 +func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { + batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine{ + return BatchG1Affine[B]{ buckets: buckets, points: points, batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } } -func (b *BatchG1Affine) IsFull() bool { +func (b *BatchG1Affine[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *BatchG1Affine) ExecuteAndReset() { +func (b *BatchG1Affine[B]) ExecuteAndReset() { if b.cptP == 0 { return } @@ -276,45 +285,45 @@ func (b *BatchG1Affine) ExecuteAndReset() { b.cptP = 0 } -func (b *BatchG1Affine) CanAdd(bID uint32) bool { +func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *BatchG1Affine) Add(op batchOp) { +func (b *BatchG1Affine[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch - B := &b.buckets[op.bucketID] + BK := &(*b.buckets)[op.bucketID] P := &b.points[op.pointID>>1] if P.IsInfinity() { return } // handle special cases with inf or -P / P - if B.IsInfinity() { + if BK.IsInfinity() { if op.isNeg() { - B.Neg(P) + BK.Neg(P) } else { - B.Set(P) + BK.Set(P) } return } if op.isNeg() { // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() + if BK.Equal(P) { + BK.setInfinity() return } } else { // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() return } } // b.bucketIds[b.cptP] = op.bucketID b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B + b.R[b.cptP] = BK if op.isNeg() { b.P[b.cptP].Neg(P) } else { @@ -323,7 +332,7 @@ func (b *BatchG1Affine) Add(op batchOp) { b.cptP++ } -func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { +func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -338,16 +347,15 @@ func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { } -func msmProcessChunkG1AffineBatchAffine(chunk uint64, +func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, - buckets []G1Affine, c uint64, points []G1Affine, scalars []fr.Element) { mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) - + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -364,7 +372,7 @@ 
func msmProcessChunkG1AffineBatchAffine(chunk uint64, s.shiftHigh = (c - nbBitsHigh) } - batch := newBatchG1Affine(buckets, points) + batch := newBatchG1Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 for i := 0; i < len(scalars); i++ { @@ -433,11 +441,12 @@ func msmProcessChunkG1AffineBatchAffine(chunk uint64, } -func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -445,29 +454,25 @@ func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended + chChunks := make([]chan g1JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g1JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g1JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
+ msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -490,1343 +495,521 @@ func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, split }() } - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } -func (p *G1Jac) batchAffineMsmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +type bucketG1AffineC1 [1 << (1 - 1)]G1Affine +type bucketG1AffineC2 [1 << (2 - 1)]G1Affine +type bucketG1AffineC3 [1 << (3 - 1)]G1Affine +type bucketG1AffineC4 [1 << (4 - 1)]G1Affine +type bucketG1AffineC5 [1 << (5 - 1)]G1Affine +type bucketG1AffineC6 [1 << (6 - 1)]G1Affine +type bucketG1AffineC7 [1 << (7 - 1)]G1Affine +type bucketG1AffineC8 [1 << (8 - 1)]G1Affine +type bucketG1AffineC9 [1 << (9 - 1)]G1Affine +type bucketG1AffineC10 [1 << (10 - 1)]G1Affine +type bucketG1AffineC11 [1 << (11 - 1)]G1Affine +type bucketG1AffineC12 [1 << (12 - 1)]G1Affine +type bucketG1AffineC13 [1 << (13 - 1)]G1Affine +type bucketG1AffineC14 [1 << (14 - 1)]G1Affine +type bucketG1AffineC15 [1 << (15 - 1)]G1Affine +type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC17 [1 << (17 - 1)]G1Affine +type bucketG1AffineC18 [1 << (18 - 1)]G1Affine +type bucketG1AffineC19 [1 << (19 - 1)]G1Affine +type bucketG1AffineC20 [1 << (20 - 1)]G1Affine +type bucketG1AffineC21 [1 << (21 - 1)]G1Affine +type bucketG1AffineC22 [1 << (22 - 1)]G1Affine +type bucketG1AffineC23 [1 << (23 - 1)]G1Affine +type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended +type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended +type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended +type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended +type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended +type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended +type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended +type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended +type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended +type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended +type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended +type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended +type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended +type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended +type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended +type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended +type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended +type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended +type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended +type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended + +type ibG1Affine interface { + bucketG1AffineC1 | + bucketG1AffineC2 | + bucketG1AffineC3 | + bucketG1AffineC4 | + bucketG1AffineC5 | + bucketG1AffineC6 | + bucketG1AffineC7 | + bucketG1AffineC8 | + 
bucketG1AffineC9 |
+		bucketG1AffineC10 |
+		bucketG1AffineC11 |
+		bucketG1AffineC12 |
+		bucketG1AffineC13 |
+		bucketG1AffineC14 |
+		bucketG1AffineC15 |
+		bucketG1AffineC16 |
+		bucketG1AffineC17 |
+		bucketG1AffineC18 |
+		bucketG1AffineC19 |
+		bucketG1AffineC20 |
+		bucketG1AffineC21 |
+		bucketG1AffineC22 |
+		bucketG1AffineC23
+}

-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+type ibg1JacExtended interface {
+	bucketg1JacExtendedC1 |
+		bucketg1JacExtendedC2 |
+		bucketg1JacExtendedC3 |
+		bucketg1JacExtendedC4 |
+		bucketg1JacExtendedC5 |
+		bucketg1JacExtendedC6 |
+		bucketg1JacExtendedC7 |
+		bucketg1JacExtendedC8 |
+		bucketg1JacExtendedC9 |
+		bucketg1JacExtendedC10 |
+		bucketg1JacExtendedC11 |
+		bucketg1JacExtendedC12 |
+		bucketg1JacExtendedC13 |
+		bucketg1JacExtendedC14 |
+		bucketg1JacExtendedC15 |
+		bucketg1JacExtendedC16 |
+		bucketg1JacExtendedC17 |
+		bucketg1JacExtendedC18 |
+		bucketg1JacExtendedC19 |
+		bucketg1JacExtendedC20 |
+		bucketg1JacExtendedC21 |
+		bucketg1JacExtendedC22 |
+		bucketg1JacExtendedC23
+}

-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g1JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g1JacExtended, 1)
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
+func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
 	}
+	p.FromJacobian(&_p)
+	return p, nil
+}

-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	// TODO @gbotrel replace this in code generator
-	if lastC >= 10 {
-		go func(j uint64, points []G1Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]G1Affine
-			msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
-	} else {
-		go func(j uint64, points []G1Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]g1JacExtended
-			msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
-	}
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
+func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows us to declare the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
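To make the cache-line remark concrete, here is a quick standalone check (a sketch, not part of the generated code; it only assumes the public bn254 package):

package main

import (
	"fmt"
	"unsafe"

	"github.com/consensys/gnark-crypto/ecc/bn254"
)

func main() {
	var g1 bn254.G1Affine
	var g2 bn254.G2Affine
	fmt.Println(unsafe.Sizeof(g1)) // 64: two 4-limb fp.Element coordinates -> one 64-byte cache line
	fmt.Println(unsafe.Sizeof(g2)) // 128: coordinates live in E2, twice as wide -> two cache lines
}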
-	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		var buckets [1 << (c - 1)]G1Affine
-		msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunk digits
+	// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
 	}

-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
 	}

-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g1JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks.
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
+		// if C > 16 && nbPoints < 1 << 23 {
+		// 	C = 16
+		// }
+		return C
+	}
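Plugging numbers into the model above makes the trade-off concrete. A throwaway sketch (assuming 4x64-bit scalars, i.e. fr.Limbs = 4, as in this package):

package main

import "fmt"

func main() {
	const bits = 4 * 64 // fr.Limbs * 64
	nbPoints := 1 << 20
	for _, c := range []uint64{8, 16, 21} {
		// cost = bits/c * (nbPoints + 2^c), the approximate group-operation count
		cost := float64(bits) * float64(nbPoints+(1<<c)) / float64(c)
		fmt.Printf("c=%-2d cost ~%.1fM group ops\n", c, cost/1e6)
	}
	// prints roughly 33.6M (c=8), 17.8M (c=16), 38.3M (c=21):
	// mid-size windows minimize the model at this instance size, matching the min search above.
}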
-	return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:])
-}
+	var C uint64
+	nbSplits := 1
+	nbChunks := 0
+	for nbChunks < config.NbTasks {
+		C = bestC(nbPoints)
+		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+		if (fr.Limbs*64)%C != 0 {
+			nbChunks++
+		}
+		nbChunks *= nbSplits
+		if nbChunks < config.NbTasks {
+			nbSplits <<= 1
+			nbPoints >>= 1
+		}
+	}

-func (p *G1Jac) batchAffineMsmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
-	const (
-		c        = 12                  // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	)
+	// partition the scalars
+	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
+	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
+	var smallValues int
+	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)

-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+	// if we have more than 10% of small values, we split the processing of the first chunk in 2
+	// we may want to do that in msmInnerG2JacBatchAffine, but that would incur a cost of looping through all scalars one more time
+	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1

-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g1JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g1JacExtended, 1)
+	// we have nbSplits intermediate results that we must sum together.
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) } + close(chDone) + return p, nil +} - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } +func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + switch c { - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) -func (p *G1Jac) batchAffineMsmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 4: + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 5: + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code 
generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 8: + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} + case 10: + batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) -func (p *G1Jac) batchAffineMsmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 11: + batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 12: + batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 13: + batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } + case 14: + 
batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 15: + batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 16: + batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go 
routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // 
number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. 
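To make the cost heuristic in the comment above concrete, here is a minimal, self-contained sketch of the window-size selection. It assumes 256-bit scalars (fr.Limbs * 64 for these curves) and a candidate list mirroring the implementedCs slice in this patch; as the comments note, the real choice is also tuned empirically.

package main

import (
	"fmt"
	"math"
)

// bestC mirrors the heuristic cost model used in this patch:
// cost(c) ~= (scalarBits / c) * (nbPoints + 2^c)
// i.e. bits/c chunks, each paying roughly nbPoints bucket additions
// plus 2^c additions to fold the buckets.
func bestC(nbPoints int) uint64 {
	const scalarBits = 256 // assumption: fr.Limbs * 64
	implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
	var C uint64
	best := math.MaxFloat64
	for _, c := range implementedCs {
		cost := float64(scalarBits) / float64(c) * float64(nbPoints+(1<<c))
		if cost < best {
			best = cost
			C = c
		}
	}
	return C
}

func main() {
	for _, n := range []int{1 << 10, 1 << 16, 1 << 22} {
		fmt.Println(n, "points -> c =", bestC(n))
	}
}

Larger instances push the optimum toward bigger windows, which is why hand-picked values can still beat the formula on specific hardware, as the comment above observes.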
- - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
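The borrow described in step 1 above, and the carry that partitionScalars propagates from window to window, can be shown on a plain uint64; the real code operates on 4-limb fr.Element values, and signedDigits is a hypothetical helper written only for illustration.

package main

import "fmt"

// signedDigits recodes a scalar into c-bit signed digits: any digit in
// [2^{c-1}, 2^c) becomes digit-2^c (negative) plus a carry of 1 into the
// next window. Only 2^{c-1} buckets are then needed, since a negative
// digit means adding -G, which is cheap to compute.
func signedDigits(scalar uint64, c uint) []int64 {
	var digits []int64
	carry := int64(0)
	mask := uint64(1)<<c - 1
	for shift := uint(0); shift < 64; shift += c {
		d := int64(scalar>>shift&mask) + carry
		carry = 0
		if d >= int64(1)<<(c-1) {
			d -= int64(1) << c
			carry = 1
		}
		digits = append(digits, d)
	}
	if carry != 0 {
		// a leftover carry needs one extra chunk
		digits = append(digits, carry)
	}
	return digits
}

func main() {
	// with c = 3, the low window of 15 is 7 >= 2^2, so it is recoded as
	// -1 with a carry: the digits start [-1 2 0 ...] and -1*1 + 2*8 = 15.
	fmt.Println(signedDigits(15, 3))
}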
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 4: - p.msmC4(points, scalars, splitFirstChunk) - - case 5: - p.msmC5(points, scalars, splitFirstChunk) - - case 6: - p.msmC6(points, scalars, splitFirstChunk) - - case 7: - p.msmC7(points, scalars, splitFirstChunk) - - case 8: - p.msmC8(points, scalars, splitFirstChunk) - - case 9: - p.msmC9(points, scalars, splitFirstChunk) - - case 10: - p.batchAffineMsmC10(points, scalars, splitFirstChunk) - - case 11: - p.batchAffineMsmC11(points, scalars, splitFirstChunk) - - case 12: - p.batchAffineMsmC12(points, scalars, splitFirstChunk) - - case 13: - p.batchAffineMsmC13(points, scalars, splitFirstChunk) - - case 14: - p.batchAffineMsmC14(points, scalars, splitFirstChunk) - - case 15: - p.batchAffineMsmC15(points, scalars, splitFirstChunk) - - case 16: - p.batchAffineMsmC16(points, scalars, splitFirstChunk) - - case 20: - p.batchAffineMsmC20(points, scalars, splitFirstChunk) - - case 21: - p.batchAffineMsmC21(points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { - var _p g2JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG2Affine struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G2Affine -} - -func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { - batchSize := len(buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG2Affine{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), - } -} - -func (b *BatchG2Affine) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG2Affine) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG2Affine) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG2Affine) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - B := &b.buckets[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if B.IsInfinity() { - if op.isNeg() { - B.Neg(P) - } else { 
- B.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} - -func msmProcessChunkG2AffineBatchAffine(chunk uint64, - chRes chan<- g2JacExtended, - buckets []G2Affine, - c uint64, - points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - batch := newBatchG2Affine(buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - op.bucketID = uint32(bits - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) - } else { - // sub - op.bucketID = (uint32(bits & ^msbWindow)) - op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) - } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() - nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - } - } else { - // put it in queue. - queue = append(queue, op) - } - } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() - for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. - } - - // flush items in batch. - batch.ExecuteAndReset() - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] - - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { - runningSum.addMixed(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func (p *G2Jac) batchAffineMsmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else 
{ - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that 
buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 17: + batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 18: + batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 19: + batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + case 20: + batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - 
}(uint64(nbChunks), points, scalars) - } + case 21: + batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 22: + batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 23: + batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + default: + panic("not implemented") } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) } -func (p *G2Jac) batchAffineMsmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance +type BatchG2Affine[B ibG2Affine] struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G2Affine + buckets *B +} - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) +func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { + batchSize := len(*buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + if batchSize <= 0 { + batchSize = 1 } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + return BatchG2Affine[B]{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } +} - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } +func (b *BatchG2Affine[B]) IsFull() bool { + return b.cptP == b.batchSize +} - if !splitFirstChunk { - 
go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func (b *BatchG2Affine[B]) ExecuteAndReset() { + if b.cptP == 0 { + return } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 } -func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance +func (b *BatchG2Affine[B]) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) + BK := &(*b.buckets)[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(P) + } else { + BK.Set(P) + } + return } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(P) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() + return + } } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = BK + if op.isNeg() { + b.P[b.cptP].Neg(P) } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + b.P[b.cptP].Set(P) } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + b.cptP++ } -func (p *G2Jac) batchAffineMsmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + 
queue[i] = queue[len(queue)-1]
+			queue = queue[:len(queue)-1]
+		}
+	}
+	return queue

-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+}

-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g2JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g2JacExtended, 1)
-	}
+func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64,
+	chRes chan<- g2JacExtended,
+	c uint64,
+	points []G2Affine,
+	scalars []fr.Element) {

-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	// TODO @gbotrel replace this in code generator
-	if lastC >= 10 {
-		go func(j uint64, points []G2Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]G2Affine
-			msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
-	} else {
-		go func(j uint64, points []G2Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]g2JacExtended
-			msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))
+	var buckets B
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
 	}

-	processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) {
-		var buckets [1 << (c - 1)]G2Affine
-		msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
 	}

-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
+	batch := newBatchG2Affine(&buckets, points)
+	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
+	nbBatches := 0
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		op := batchOp{pointID: uint32(i) << 1}
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			op.bucketID = uint32(bits - 1)
+			// buckets[bits-1].Add(&points[i], &buckets[bits-1])
+		} else {
+			// sub
+			op.bucketID = (uint32(bits & ^msbWindow))
+			op.pointID += 1
+			// op.isNeg = true
+			// buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i])
+		}
+		if batch.CanAdd(op.bucketID) {
+			batch.Add(op)
+			if batch.IsFull() {
+				batch.ExecuteAndReset()
+				nbBatches++
+				if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing
+					batch.Add(queue[len(queue)-1])
+					queue = queue[:len(queue)-1]
+				}
+			}
+		} else {
+			// put it in queue.
+ queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG2Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) } - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + chRes <- total + } -func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -1834,29 +1017,25 @@ func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
+ msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -1879,5 +1058,104 @@ func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, split }() } - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) +} + +type bucketG2AffineC1 [1 << (1 - 1)]G2Affine +type bucketG2AffineC2 [1 << (2 - 1)]G2Affine +type bucketG2AffineC3 [1 << (3 - 1)]G2Affine +type bucketG2AffineC4 [1 << (4 - 1)]G2Affine +type bucketG2AffineC5 [1 << (5 - 1)]G2Affine +type bucketG2AffineC6 [1 << (6 - 1)]G2Affine +type bucketG2AffineC7 [1 << (7 - 1)]G2Affine +type bucketG2AffineC8 [1 << (8 - 1)]G2Affine +type bucketG2AffineC9 [1 << (9 - 1)]G2Affine +type bucketG2AffineC10 [1 << (10 - 1)]G2Affine +type bucketG2AffineC11 [1 << (11 - 1)]G2Affine +type bucketG2AffineC12 [1 << (12 - 1)]G2Affine +type bucketG2AffineC13 [1 << (13 - 1)]G2Affine +type bucketG2AffineC14 [1 << (14 - 1)]G2Affine +type bucketG2AffineC15 [1 << (15 - 1)]G2Affine +type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC17 [1 << (17 - 1)]G2Affine +type bucketG2AffineC18 [1 << (18 - 1)]G2Affine +type bucketG2AffineC19 [1 << (19 - 1)]G2Affine +type bucketG2AffineC20 [1 << (20 - 1)]G2Affine +type bucketG2AffineC21 [1 << (21 - 1)]G2Affine +type bucketG2AffineC22 [1 << (22 - 1)]G2Affine +type bucketG2AffineC23 [1 << (23 - 1)]G2Affine +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended +type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended +type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended +type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended +type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended +type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended +type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended +type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended +type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended +type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended +type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended +type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended +type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended +type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended +type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended +type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended +type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended +type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended +type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended + +type ibG2Affine interface { + bucketG2AffineC1 | + bucketG2AffineC2 | + bucketG2AffineC3 | + bucketG2AffineC4 | + bucketG2AffineC5 | + bucketG2AffineC6 | + bucketG2AffineC7 | + bucketG2AffineC8 | + bucketG2AffineC9 | + bucketG2AffineC10 | + bucketG2AffineC11 | + bucketG2AffineC12 | + bucketG2AffineC13 | + bucketG2AffineC14 | + bucketG2AffineC15 | + bucketG2AffineC16 | + bucketG2AffineC17 | + bucketG2AffineC18 | + bucketG2AffineC19 | + 
bucketG2AffineC20 | + bucketG2AffineC21 | + bucketG2AffineC22 | + bucketG2AffineC23 +} + +type ibg2JacExtended interface { + bucketg2JacExtendedC1 | + bucketg2JacExtendedC2 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC4 | + bucketg2JacExtendedC5 | + bucketg2JacExtendedC6 | + bucketg2JacExtendedC7 | + bucketg2JacExtendedC8 | + bucketg2JacExtendedC9 | + bucketg2JacExtendedC10 | + bucketg2JacExtendedC11 | + bucketg2JacExtendedC12 | + bucketg2JacExtendedC13 | + bucketg2JacExtendedC14 | + bucketg2JacExtendedC15 | + bucketg2JacExtendedC16 | + bucketg2JacExtendedC17 | + bucketg2JacExtendedC18 | + bucketg2JacExtendedC19 | + bucketg2JacExtendedC20 | + bucketg2JacExtendedC21 | + bucketg2JacExtendedC22 | + bucketg2JacExtendedC23 } diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 9e40c04401..a14b7946f2 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index b42536f1b9..ebc19dc090 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -221,7 +221,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -294,50 +294,74 @@ func msmInnerG1Jac(p *G1Jac, c int, 
points []G1Affine, scalars []fr.Element, spl switch c { + case 1: + msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) case 6: - p.msmC6(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) case 7: - p.msmC7(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) case 9: - p.msmC9(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) case 10: - p.msmC10(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) case 11: - p.msmC11(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) case 12: - p.msmC12(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) case 13: - p.msmC13(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) case 14: - p.msmC14(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) case 15: - p.msmC15(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) case 16: - p.msmC16(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) + + case 18: + msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + + case 19: + msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) case 20: - p.msmC20(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) case 21: - p.msmC21(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) + + case 22: + msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) + + case 23: + msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") @@ -360,9 +384,8 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J return p.unsafeFromJacExtended(&_p) } -func msmProcessChunkG1Affine(chunk uint64, +func 
msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64,
 	chRes chan<- g1JacExtended,
-	buckets []g1JacExtended,
 	c uint64,
 	points []G1Affine,
 	scalars []fr.Element) {
@@ -370,6 +393,7 @@ func msmProcessChunkG1Affine(chunk uint64,
 	mask := uint64((1 << c) - 1) // low c bits are 1
 	msbWindow := uint64(1 << (c - 1))
+	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
@@ -424,26 +448,36 @@
 
 }
 
-func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
-	const (
-		c        = 4                   // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	)
-
+func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
+	nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
 	// for each chunk, spawn one go routine that'll loop through all the scalars in the
 	// corresponding bit-window
 	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
 	// critical for performance
 
 	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks]chan g1JacExtended
+	chChunks := make([]chan g1JacExtended, nbChunks)
 	for i := 0; i < len(chChunks); i++ {
 		chChunks[i] = make(chan g1JacExtended, 1)
 	}
 
+	if (fr.Limbs*64)%c != 0 {
+		// TODO @gbotrel not always needed to do ext jac here.
+		go func(j uint64, points []G1Affine, scalars []fr.Element) {
+			// var buckets LB
+			// lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
+			// buckets := make([]g1JacExtended, 1<<(lastC-1))
+			// TODO @gbotrel last C restore.
+			msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars)
+		}(uint64(nbChunks-1), points, scalars)
+		nbChunks--
+	}
+
 	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		var buckets [1 << (c - 1)]g1JacExtended
-		msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars)
+		msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars)
 	}
 
 	for j := int(nbChunks - 1); j > 0; j-- {
@@ -466,1719 +500,325 @@ func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, splitFirstChunk b
 		}()
 	}
 
-	return msmReduceChunkG1Affine(p, c, chChunks[:])
+	return msmReduceChunkG1Affine(p, int(c), chChunks[:])
 }
 
-func (p *G1Jac) msmC5(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
-	const (
-		c        = 5                   // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	)
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
+func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExp(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
 
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
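The generic drivers above (msmCG1Affine[B, LB] and the G2 counterpart) hinge on one Go generics detail: the bucket storage is a type parameter constrained to fixed-size array types, so `var buckets B` is a stack allocation whose length is a compile-time constant rather than a heap slice sized at runtime. A stripped-down sketch of the pattern follows; the names point, ibBuckets and bucketC4/bucketC5 are illustrative and not part of this patch.

package main

import "fmt"

type point struct{ x, y int }

// one array type per window size, as in the bucket*C* declarations above
type bucketC4 [1 << (4 - 1)]point
type bucketC5 [1 << (5 - 1)]point

// a union constraint listing the permitted bucket array types
type ibBuckets interface {
	bucketC4 | bucketC5
}

func processChunk[B ibBuckets](label string) {
	var buckets B // array value: lives on the stack, no make() needed
	fmt.Println(label, "->", len(buckets), "buckets")
}

func main() {
	processChunk[bucketC4]("c=4") // c=4 -> 8 buckets
	processChunk[bucketC5]("c=5") // c=5 -> 16 buckets
}

Because each instantiation fixes len(buckets) at compile time, the per-chunk loop costs nothing extra compared to the old per-c duplicated methods, while the switch in msmInnerG1Jac shrinks to one generic call per window size.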
+func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the msmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2...)
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.

+	// for each msmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunk digits
+	// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)

+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}

+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}

+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks.
+	bestC := func(nbPoints int) uint64 {
+		// implemented msmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
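For intuition, this cost model can be exercised on its own; a minimal standalone sketch (assuming 256-bit scalars, i.e. fr.Limbs == 4; bestCSketch is a hypothetical name, not part of this patch):

	import "math"

	// bestCSketch mirrors the selection loop in the diff:
	// cost(c) = bits/c * (nbPoints + 2^c), minimized over the candidate window sizes.
	func bestCSketch(nbPoints int) uint64 {
		const bits = 256
		bestC, minCost := uint64(1), math.MaxFloat64
		for c := uint64(1); c <= 23; c++ {
			cost := float64(bits) * float64(nbPoints+(1<<c)) / float64(c)
			if cost < minCost {
				minCost, bestC = cost, c
			}
		}
		return bestC
	}

A larger nbPoints amortizes the 2^c bucket term over more points and pushes the optimum toward wider windows, which is why the splitting loop further down recomputes bestC after halving nbPoints.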
+		// for example, on an MBP 2016, for G2 MultiExp > 8M points, hand-picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
+		// if C > 16 && nbPoints < 1 << 23 {
+		// 	C = 16
+		// }
+		return C
+	}

-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g1JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
+	var C uint64
+	nbSplits := 1
+	nbChunks := 0
+	for nbChunks < config.NbTasks {
+		C = bestC(nbPoints)
+		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+		if (fr.Limbs*64)%C != 0 {
+			nbChunks++
+		}
+		nbChunks *= nbSplits
+		if nbChunks < config.NbTasks {
+			nbSplits <<= 1
+			nbPoints >>= 1
+		}
	}

-	return msmReduceChunkG1Affine(p, c, chChunks[:])
-}
+	// partition the scalars
+	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
+	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
+	var smallValues int
+	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)

-func (p *G1Jac) msmC6(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
-	const (
-		c        = 6                          // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c)        // number of c-bit radixes in a scalar
-	)
+	// if we have more than 10% of small values, we split the processing of the first chunk in 2
+	// we may want to do that in msmInnerG2Jac, but that would incur a cost of looping through all scalars one more time
+	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1

-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+	// we have nbSplits intermediate results that we must sum together.
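To make the splitting loop above concrete, consider a hypothetical run with 256-bit scalars, config.NbTasks = 32, and bestC returning 16 on both calls: the first pass gives nbChunks = 256/16 = 16 < 32, so nbSplits doubles to 2 and nbPoints halves; the second pass gives 16 * 2 = 32 chunks and the loop exits with C = 16. The code just below then launches nbSplits-1 = 1 extra msmInnerG2Jac goroutine on its share of the points and folds the partial results into p.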
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) + msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) } + close(chDone) + return p, nil +} - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) +func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + switch c { - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC7(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 7 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 4: + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 5: + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 
1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 8: + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 10: + msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} + case 11: + msmCG2Affine[bucketg2JacExtendedC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC8(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 12: + msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 13: + msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 14: + msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 15: + msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 16: + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 17: + msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, 
c, chChunks[:]) -} + case 18: + msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC9(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 9 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 19: + msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split 
:= len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, 
chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, 
chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], 
chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], 
scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExp(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the msmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each msmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. 
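Step 1 of the note above (the borrow that keeps every digit's magnitude at most 2^{c-1} and halves the bucket count) can be sketched in isolation; a toy version on a single 64-bit word, assuming c divides 64 (signedDigits is a hypothetical helper, not the patch's partitionScalars):

	// signedDigits splits a word into 64/c signed c-bit digits: a digit larger
	// than 2^(c-1) borrows 2^c from the next window and goes negative, so a
	// negative digit means "subtract the point" and only 2^(c-1) buckets are needed.
	func signedDigits(word uint64, c uint64) []int64 {
		mask := uint64(1)<<c - 1
		digits := make([]int64, 0, 64/c)
		carry := int64(0)
		for shift := uint64(0); shift < 64; shift += c {
			d := int64((word>>shift)&mask) + carry
			carry = 0
			if d > int64(1)<<(c-1) {
				d -= int64(1) << c
				carry = 1
			}
			digits = append(digits, d)
		}
		return digits
	}

For example, with c = 4 the digit 15 becomes -1 with a carry of 1 into the next window, since -1 + 16 = 15.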
- // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. - _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 4: - p.msmC4(points, scalars, splitFirstChunk) - - case 5: - p.msmC5(points, scalars, splitFirstChunk) - - case 6: - p.msmC6(points, scalars, splitFirstChunk) - - case 7: - p.msmC7(points, scalars, splitFirstChunk) - - case 8: - p.msmC8(points, scalars, splitFirstChunk) - - case 9: - p.msmC9(points, scalars, splitFirstChunk) - - case 10: - p.msmC10(points, scalars, splitFirstChunk) - - case 11: - p.msmC11(points, scalars, splitFirstChunk) - - case 12: - p.msmC12(points, scalars, splitFirstChunk) - - case 13: - p.msmC13(points, scalars, splitFirstChunk) - - case 14: - p.msmC14(points, scalars, splitFirstChunk) - - case 15: - p.msmC15(points, scalars, splitFirstChunk) - - case 16: - p.msmC16(points, scalars, splitFirstChunk) - - case 20: - p.msmC20(points, scalars, splitFirstChunk) - - case 21: - p.msmC21(points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -// msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { - var _p g2JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -func msmProcessChunkG2Affine(chunk uint64, - chRes chan<- g2JacExtended, - buckets []g2JacExtended, - c 
uint64, - points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) - } - } - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func (p *G2Jac) msmC4(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 4 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC5(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 5 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 
256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC6(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 6 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC7(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 7 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - 
chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC8(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC9(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 9 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - 
msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, 
scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + case 20: + msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + case 21: + msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 22: + msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 23: + msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + default: + panic("not implemented") } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) } -func (p *G2Jac) msmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through 
all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +// msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { + var _p g2JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) } - return msmReduceChunkG2Affine(p, c, chChunks[:]) + return p.unsafeFromJacExtended(&_p) } -func (p *G2Jac) msmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) +func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + scalars []fr.Element) { - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + var buckets B 
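+	// buckets has a fixed size of 1<<(c-1) (it is one of the array types
+	// satisfying ibg2JacExtended), so its length is a compile-time constant
+	// and the array can stay on this goroutine's stack.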
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
 	}
 
-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g2JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
 	}
 
-	return msmReduceChunkG2Affine(p, c, chChunks[:])
-}
-
-func (p *G2Jac) msmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac {
-	const (
-		c        = 15                   // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c)  // number of c-bit radixes in a scalar
-	)
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
 
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+		if bits == 0 {
+			continue
+		}
 
-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g2JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g2JacExtended, 1)
+		// if the msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].addMixed(&points[i])
+		} else {
+			// sub
+			buckets[bits & ^msbWindow].subMixed(&points[i])
+		}
 	}
 
-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	go func(j uint64, points []G2Affine, scalars []fr.Element) {
-		var buckets [1 << (lastC - 1)]g2JacExtended
-		msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars)
-	}(uint64(nbChunks), points, scalars)
-
-	processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) {
-		var buckets [1 << (c - 1)]g2JacExtended
-		msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars)
-	}
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ...
+ n*bucket[n-1] - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].ZZ.IsZero() { + runningSum.add(&buckets[k]) + } + total.add(&runningSum) } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + chRes <- total - return msmReduceChunkG2Affine(p, c, chChunks[:]) } -func (p *G2Jac) msmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), 
points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G2Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel last C restore. + msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -2201,5 +841,5 @@ func (p *G2Jac) msmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk }() } - return msmReduceChunkG2Affine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index b92b826e91..4becea22f3 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -93,7 +93,7 @@ func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, con // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ 
-166,102 +166,111 @@ func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.E switch c { + case 1: + msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) case 6: - p.msmC6(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) case 7: - p.msmC7(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) case 9: - p.msmC9(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) case 10: - p.batchAffineMsmC10(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) case 11: - p.batchAffineMsmC11(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) case 12: - p.batchAffineMsmC12(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) case 13: - p.batchAffineMsmC13(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) case 14: - p.batchAffineMsmC14(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) case 15: - p.batchAffineMsmC15(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) case 16: - p.batchAffineMsmC16(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) + + case 18: + batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + + case 19: + batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) case 20: - p.batchAffineMsmC20(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) case 21: - p.batchAffineMsmC21(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) + + case 22: + batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) + + case 23: + batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") } } -// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of 
the buckets into the result of the multiExp -func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG1Affine struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G1Affine +type BatchG1Affine[B ibG1Affine] struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G1Affine + buckets *B } -func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { - batchSize := len(buckets) / 5 +func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { + batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine{ + return BatchG1Affine[B]{ buckets: buckets, points: points, batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } } -func (b *BatchG1Affine) IsFull() bool { +func (b *BatchG1Affine[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *BatchG1Affine) ExecuteAndReset() { +func (b *BatchG1Affine[B]) ExecuteAndReset() { if b.cptP == 0 { return } @@ -276,45 +285,45 @@ func (b *BatchG1Affine) ExecuteAndReset() { b.cptP = 0 } -func (b *BatchG1Affine) CanAdd(bID uint32) bool { +func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *BatchG1Affine) Add(op batchOp) { +func (b *BatchG1Affine[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch - B := &b.buckets[op.bucketID] + BK := &(*b.buckets)[op.bucketID] P := &b.points[op.pointID>>1] if P.IsInfinity() { return } // handle special cases with inf or -P / P - if B.IsInfinity() { + if BK.IsInfinity() { if op.isNeg() { - B.Neg(P) + BK.Neg(P) } else { - B.Set(P) + BK.Set(P) } return } if op.isNeg() { // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() + if BK.Equal(P) { + BK.setInfinity() return } } else { // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() return } } // b.bucketIds[b.cptP] = op.bucketID b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B + b.R[b.cptP] = BK if op.isNeg() { b.P[b.cptP].Neg(P) } else { @@ -323,7 +332,7 @@ func (b *BatchG1Affine) Add(op batchOp) { b.cptP++ } -func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { +func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -338,16 +347,15 @@ func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { } -func msmProcessChunkG1AffineBatchAffine(chunk uint64, +func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, - buckets []G1Affine, c uint64, points []G1Affine, scalars []fr.Element) { mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) - + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -364,7 +372,7 @@ 
func msmProcessChunkG1AffineBatchAffine(chunk uint64, s.shiftHigh = (c - nbBitsHigh) } - batch := newBatchG1Affine(buckets, points) + batch := newBatchG1Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 for i := 0; i < len(scalars); i++ { @@ -433,11 +441,12 @@ func msmProcessChunkG1AffineBatchAffine(chunk uint64, } -func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -445,29 +454,25 @@ func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended + chChunks := make([]chan g1JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g1JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g1JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
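+			// note: when c does not divide fr.Limbs*64, this last chunk spans fewer than
+			// c bits, so a bucket array sized for that narrower lastC window would
+			// suffice; until the lastC handling above is restored, the full-size
+			// extended-Jacobian bucket type J is used instead.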
+ msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -490,1343 +495,521 @@ func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, split }() } - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } -func (p *G1Jac) batchAffineMsmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +type bucketG1AffineC1 [1 << (1 - 1)]G1Affine +type bucketG1AffineC2 [1 << (2 - 1)]G1Affine +type bucketG1AffineC3 [1 << (3 - 1)]G1Affine +type bucketG1AffineC4 [1 << (4 - 1)]G1Affine +type bucketG1AffineC5 [1 << (5 - 1)]G1Affine +type bucketG1AffineC6 [1 << (6 - 1)]G1Affine +type bucketG1AffineC7 [1 << (7 - 1)]G1Affine +type bucketG1AffineC8 [1 << (8 - 1)]G1Affine +type bucketG1AffineC9 [1 << (9 - 1)]G1Affine +type bucketG1AffineC10 [1 << (10 - 1)]G1Affine +type bucketG1AffineC11 [1 << (11 - 1)]G1Affine +type bucketG1AffineC12 [1 << (12 - 1)]G1Affine +type bucketG1AffineC13 [1 << (13 - 1)]G1Affine +type bucketG1AffineC14 [1 << (14 - 1)]G1Affine +type bucketG1AffineC15 [1 << (15 - 1)]G1Affine +type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC17 [1 << (17 - 1)]G1Affine +type bucketG1AffineC18 [1 << (18 - 1)]G1Affine +type bucketG1AffineC19 [1 << (19 - 1)]G1Affine +type bucketG1AffineC20 [1 << (20 - 1)]G1Affine +type bucketG1AffineC21 [1 << (21 - 1)]G1Affine +type bucketG1AffineC22 [1 << (22 - 1)]G1Affine +type bucketG1AffineC23 [1 << (23 - 1)]G1Affine +type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended +type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended +type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended +type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended +type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended +type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended +type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended +type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended +type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended +type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended +type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended +type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended +type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended +type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended +type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended +type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended +type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended +type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended +type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended +type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended + +type ibG1Affine interface { + bucketG1AffineC1 | + bucketG1AffineC2 | + bucketG1AffineC3 | + bucketG1AffineC4 | + bucketG1AffineC5 | + bucketG1AffineC6 | + bucketG1AffineC7 | + bucketG1AffineC8 | + 
bucketG1AffineC9 |
+		bucketG1AffineC10 |
+		bucketG1AffineC11 |
+		bucketG1AffineC12 |
+		bucketG1AffineC13 |
+		bucketG1AffineC14 |
+		bucketG1AffineC15 |
+		bucketG1AffineC16 |
+		bucketG1AffineC17 |
+		bucketG1AffineC18 |
+		bucketG1AffineC19 |
+		bucketG1AffineC20 |
+		bucketG1AffineC21 |
+		bucketG1AffineC22 |
+		bucketG1AffineC23
+}
 
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+type ibg1JacExtended interface {
+	bucketg1JacExtendedC1 |
+		bucketg1JacExtendedC2 |
+		bucketg1JacExtendedC3 |
+		bucketg1JacExtendedC4 |
+		bucketg1JacExtendedC5 |
+		bucketg1JacExtendedC6 |
+		bucketg1JacExtendedC7 |
+		bucketg1JacExtendedC8 |
+		bucketg1JacExtendedC9 |
+		bucketg1JacExtendedC10 |
+		bucketg1JacExtendedC11 |
+		bucketg1JacExtendedC12 |
+		bucketg1JacExtendedC13 |
+		bucketg1JacExtendedC14 |
+		bucketg1JacExtendedC15 |
+		bucketg1JacExtendedC16 |
+		bucketg1JacExtendedC17 |
+		bucketg1JacExtendedC18 |
+		bucketg1JacExtendedC19 |
+		bucketg1JacExtendedC20 |
+		bucketg1JacExtendedC21 |
+		bucketg1JacExtendedC22 |
+		bucketg1JacExtendedC23
+}
 
-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g1JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g1JacExtended, 1)
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
 	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
 
-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	// TODO @gbotrel replace this in code generator
-	if lastC >= 10 {
-		go func(j uint64, points []G1Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]G1Affine
-			msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
-	} else {
-		go func(j uint64, points []G1Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]g1JacExtended
-			msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
-	}
 
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
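+	//     (concretely: a BN254 G1 affine point is two 32-byte field elements, exactly
+	//     one 64-byte cache line, while a G2 affine point has coordinates in a
+	//     degree-2 extension and occupies 128 bytes, i.e. two cache lines)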
-	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		var buckets [1 << (c - 1)]G1Affine
-		msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunk digits
+	// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
 	}
 
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
 	}
 
-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g1JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks,
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
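+		// as a rough illustration of the cost formula above, with 256-bit scalars and
+		// nbPoints = 2^20: c = 16 gives 16 windows and ~16*(2^20+2^16) ≈ 1.8e7 group
+		// operations, while c = 12 gives ~21.3*(2^20+2^12) ≈ 2.2e7.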
+		// if C > 16 && nbPoints < 1 << 23 {
+		// 	C = 16
+		// }
+		return C
 	}
 
-	return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:])
-}
+	var C uint64
+	nbSplits := 1
+	nbChunks := 0
+	for nbChunks < config.NbTasks {
+		C = bestC(nbPoints)
+		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+		if (fr.Limbs*64)%C != 0 {
+			nbChunks++
+		}
+		nbChunks *= nbSplits
+		if nbChunks < config.NbTasks {
+			nbSplits <<= 1
+			nbPoints >>= 1
+		}
+	}
 
-func (p *G1Jac) batchAffineMsmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
-	const (
-		c        = 12                   // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c)  // number of c-bit radixes in a scalar
-	)
+	// partition the scalars
+	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
+	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
+	var smallValues int
+	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
 
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+	// if we have more than 10% of small values, we split the processing of the first chunk in 2
+	// we may want to do that in msmInnerG2JacBatchAffine, but that would incur a cost of looping through all scalars one more time
+	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
 
-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g1JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g1JacExtended, 1)
+	// we have nbSplits intermediate results that we must sum together.
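+	// each of the nbSplits slices is processed by msmInnerG2JacBatchAffine in its own
+	// goroutine; partial G2Jac results come back on chDone and are accumulated into p.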
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) } + close(chDone) + return p, nil +} - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } +func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + switch c { - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) -func (p *G1Jac) batchAffineMsmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 4: + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 5: + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code 
generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 8: + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} + case 10: + batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) -func (p *G1Jac) batchAffineMsmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 11: + batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 12: + batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 13: + batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } + case 14: + 
batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 15: + batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 16: + batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go 
routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // 
number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. 
- - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 4: - p.msmC4(points, scalars, splitFirstChunk) - - case 5: - p.msmC5(points, scalars, splitFirstChunk) - - case 6: - p.msmC6(points, scalars, splitFirstChunk) - - case 7: - p.msmC7(points, scalars, splitFirstChunk) - - case 8: - p.msmC8(points, scalars, splitFirstChunk) - - case 9: - p.msmC9(points, scalars, splitFirstChunk) - - case 10: - p.batchAffineMsmC10(points, scalars, splitFirstChunk) - - case 11: - p.batchAffineMsmC11(points, scalars, splitFirstChunk) - - case 12: - p.batchAffineMsmC12(points, scalars, splitFirstChunk) - - case 13: - p.batchAffineMsmC13(points, scalars, splitFirstChunk) - - case 14: - p.batchAffineMsmC14(points, scalars, splitFirstChunk) - - case 15: - p.batchAffineMsmC15(points, scalars, splitFirstChunk) - - case 16: - p.batchAffineMsmC16(points, scalars, splitFirstChunk) - - case 20: - p.batchAffineMsmC20(points, scalars, splitFirstChunk) - - case 21: - p.batchAffineMsmC21(points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { - var _p g2JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG2Affine struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G2Affine -} - -func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { - batchSize := len(buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG2Affine{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), - } -} - -func (b *BatchG2Affine) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG2Affine) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG2Affine) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG2Affine) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - B := &b.buckets[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if B.IsInfinity() { - if op.isNeg() { - B.Neg(P) - } else { 
- B.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} - -func msmProcessChunkG2AffineBatchAffine(chunk uint64, - chRes chan<- g2JacExtended, - buckets []G2Affine, - c uint64, - points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - batch := newBatchG2Affine(buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - op.bucketID = uint32(bits - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) - } else { - // sub - op.bucketID = (uint32(bits & ^msbWindow)) - op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) - } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() - nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - } - } else { - // put it in queue. - queue = append(queue, op) - } - } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() - for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. - } - - // flush items in batch. - batch.ExecuteAndReset() - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] - - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { - runningSum.addMixed(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func (p *G2Jac) batchAffineMsmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else 
{ - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that 
buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 17: + batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 18: + batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 19: + batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + case 20: + batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - 
}(uint64(nbChunks), points, scalars) - } + case 21: + batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 22: + batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 23: + batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + default: + panic("not implemented") } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) } -func (p *G2Jac) batchAffineMsmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance +type BatchG2Affine[B ibG2Affine] struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G2Affine + buckets *B +} - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) +func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { + batchSize := len(*buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + if batchSize <= 0 { + batchSize = 1 } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + return BatchG2Affine[B]{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } +} - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } +func (b *BatchG2Affine[B]) IsFull() bool { + return b.cptP == b.batchSize +} - if !splitFirstChunk { - 
go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func (b *BatchG2Affine[B]) ExecuteAndReset() { + if b.cptP == 0 { + return } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 } -func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance +func (b *BatchG2Affine[B]) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) + BK := &(*b.buckets)[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(P) + } else { + BK.Set(P) + } + return } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(P) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() + return + } } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = BK + if op.isNeg() { + b.P[b.cptP].Neg(P) } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + b.P[b.cptP].Set(P) } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + b.cptP++ } -func (p *G2Jac) batchAffineMsmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + 
queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance +} - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } +func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + scalars []fr.Element) { - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() } - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) } - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + batch := newBatchG2Affine(&buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. 
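Note on the scheduling logic here: every operation in a batch shares a single field inversion computed from a snapshot of the bucket values, so a bucket may appear at most once per batch — that is what CanAdd enforces, and why a conflicting op is parked in the queue instead. A hedged, standalone sketch of that invariant, with bucket IDs as plain integers (arrivals, flush and batchSize are illustrative names, not the patch's API):

package main

import "fmt"

func main() {
	arrivals := []uint32{5, 9, 5, 2, 9} // bucket IDs produced by the scalar loop
	const batchSize = 3

	inBatch := make(map[uint32]struct{}, batchSize)
	batch := make([]uint32, 0, batchSize)
	var queue []uint32

	flush := func() {
		if len(batch) == 0 {
			return
		}
		fmt.Println("execute:", batch) // one shared inversion + the affine adds
		batch = batch[:0]
		for k := range inBatch {
			delete(inBatch, k)
		}
	}

	for _, b := range arrivals {
		if _, used := inBatch[b]; used {
			queue = append(queue, b) // same bucket twice in one batch: defer it
			continue
		}
		inBatch[b] = struct{}{}
		batch = append(batch, b)
		if len(batch) == batchSize {
			flush()
		}
	}
	for len(queue) != 0 { // drain deferred ops; buckets are fresh after a flush
		b := queue[len(queue)-1]
		queue = queue[:len(queue)-1]
		batch = append(batch, b)
		flush() // execute even if not full, mirroring the loop above
	}
	flush()
}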
+ queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG2Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) } - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + chRes <- total + } -func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -1834,29 +1017,25 @@ func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
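The commented-out lastC above is the width of the final, smaller window when c does not divide the scalar size: lastC = fr.Limbs*64 - c*(fr.Limbs*64/c). It is also what pairs the two type parameters in the case 17..23 dispatch earlier in this diff — e.g. for c = 20, a 256-bit scalar leaves a 16-bit last window, hence bucketG2AffineC20 together with bucketg2JacExtendedC16. A standalone check of that arithmetic, assuming 4 × 64-bit limbs:

package main

import "fmt"

func main() {
	const scalarBits = 4 * 64 // fr.Limbs * 64 for a 256-bit scalar field

	for _, c := range []uint64{17, 18, 19, 20, 21, 22, 23} {
		lastC := scalarBits - c*(scalarBits/c) // bits left over for the final window
		fmt.Printf("c=%d -> lastC=%d (1<<%d last-window buckets)\n", c, lastC, lastC-1)
	}
}

Running it reproduces the pairings in the switch: c=17→1, 18→4, 19→9, 20→16, 21→4, 22→14, 23→3.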
+ msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -1879,5 +1058,104 @@ func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, split }() } - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) +} + +type bucketG2AffineC1 [1 << (1 - 1)]G2Affine +type bucketG2AffineC2 [1 << (2 - 1)]G2Affine +type bucketG2AffineC3 [1 << (3 - 1)]G2Affine +type bucketG2AffineC4 [1 << (4 - 1)]G2Affine +type bucketG2AffineC5 [1 << (5 - 1)]G2Affine +type bucketG2AffineC6 [1 << (6 - 1)]G2Affine +type bucketG2AffineC7 [1 << (7 - 1)]G2Affine +type bucketG2AffineC8 [1 << (8 - 1)]G2Affine +type bucketG2AffineC9 [1 << (9 - 1)]G2Affine +type bucketG2AffineC10 [1 << (10 - 1)]G2Affine +type bucketG2AffineC11 [1 << (11 - 1)]G2Affine +type bucketG2AffineC12 [1 << (12 - 1)]G2Affine +type bucketG2AffineC13 [1 << (13 - 1)]G2Affine +type bucketG2AffineC14 [1 << (14 - 1)]G2Affine +type bucketG2AffineC15 [1 << (15 - 1)]G2Affine +type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC17 [1 << (17 - 1)]G2Affine +type bucketG2AffineC18 [1 << (18 - 1)]G2Affine +type bucketG2AffineC19 [1 << (19 - 1)]G2Affine +type bucketG2AffineC20 [1 << (20 - 1)]G2Affine +type bucketG2AffineC21 [1 << (21 - 1)]G2Affine +type bucketG2AffineC22 [1 << (22 - 1)]G2Affine +type bucketG2AffineC23 [1 << (23 - 1)]G2Affine +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended +type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended +type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended +type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended +type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended +type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended +type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended +type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended +type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended +type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended +type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended +type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended +type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended +type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended +type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended +type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended +type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended +type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended +type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended + +type ibG2Affine interface { + bucketG2AffineC1 | + bucketG2AffineC2 | + bucketG2AffineC3 | + bucketG2AffineC4 | + bucketG2AffineC5 | + bucketG2AffineC6 | + bucketG2AffineC7 | + bucketG2AffineC8 | + bucketG2AffineC9 | + bucketG2AffineC10 | + bucketG2AffineC11 | + bucketG2AffineC12 | + bucketG2AffineC13 | + bucketG2AffineC14 | + bucketG2AffineC15 | + bucketG2AffineC16 | + bucketG2AffineC17 | + bucketG2AffineC18 | + bucketG2AffineC19 | + 
bucketG2AffineC20 | + bucketG2AffineC21 | + bucketG2AffineC22 | + bucketG2AffineC23 +} + +type ibg2JacExtended interface { + bucketg2JacExtendedC1 | + bucketg2JacExtendedC2 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC4 | + bucketg2JacExtendedC5 | + bucketg2JacExtendedC6 | + bucketg2JacExtendedC7 | + bucketg2JacExtendedC8 | + bucketg2JacExtendedC9 | + bucketg2JacExtendedC10 | + bucketg2JacExtendedC11 | + bucketg2JacExtendedC12 | + bucketg2JacExtendedC13 | + bucketg2JacExtendedC14 | + bucketg2JacExtendedC15 | + bucketg2JacExtendedC16 | + bucketg2JacExtendedC17 | + bucketg2JacExtendedC18 | + bucketg2JacExtendedC19 | + bucketg2JacExtendedC20 | + bucketg2JacExtendedC21 | + bucketg2JacExtendedC22 | + bucketg2JacExtendedC23 } diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index 466e6499a1..c4acf67088 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index 25a18a9457..a66bb3aa70 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -221,7 +221,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -294,50 +294,74 @@ func msmInnerG1Jac(p *G1Jac, c int, 
points []G1Affine, scalars []fr.Element, spl switch c { + case 1: + msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) case 6: - p.msmC6(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) case 7: - p.msmC7(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) case 9: - p.msmC9(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) case 10: - p.msmC10(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) case 11: - p.msmC11(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) case 12: - p.msmC12(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) case 13: - p.msmC13(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) case 14: - p.msmC14(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) case 15: - p.msmC15(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) case 16: - p.msmC16(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) + + case 18: + msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + + case 19: + msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) case 20: - p.msmC20(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) case 21: - p.msmC21(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) + + case 22: + msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) + + case 23: + msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") @@ -360,9 +384,8 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J return p.unsafeFromJacExtended(&_p) } -func msmProcessChunkG1Affine(chunk uint64, +func 
msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, - buckets []g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element) { @@ -370,6 +393,7 @@ func msmProcessChunkG1Affine(chunk uint64, mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -424,26 +448,36 @@ func msmProcessChunkG1Affine(chunk uint64, } -func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 4 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - +func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended + chChunks := make([]chan g1JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g1JacExtended, 1) } + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G1Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g1JacExtended, 1<<(lastC-1)) + // TODO @gbotrel last C restore. + msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- + } + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -466,1719 +500,325 @@ func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, splitFirstChunk b }() } - return msmReduceChunkG1Affine(p, c, chChunks[:]) + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } -func (p *G1Jac) msmC5(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 5 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf +// +// This call return an error if len(scalars) != len(points) or if provided config is invalid. +func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { + var _p G2Jac + if _, err := _p.MultiExp(points, scalars, config); err != nil { + return nil, err + } + p.FromJacobian(&_p) + return p, nil +} - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance +// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf +// +// This call return an error if len(scalars) != len(points) or if provided config is invalid. 
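Since this exported method is the entry point that reviewers will exercise, a hedged usage sketch may help (import paths and Generators() as they exist in gnark-crypto at the time of this patch; the point/scalar counts are illustrative):

package main

import (
	"fmt"

	"github.com/consensys/gnark-crypto/ecc"
	bls12381 "github.com/consensys/gnark-crypto/ecc/bls12-381"
	"github.com/consensys/gnark-crypto/ecc/bls12-381/fr"
)

func main() {
	const n = 4
	_, _, _, g2Aff := bls12381.Generators()

	points := make([]bls12381.G2Affine, n)
	scalars := make([]fr.Element, n)
	for i := 0; i < n; i++ {
		points[i] = g2Aff
		scalars[i].SetRandom()
	}

	var result bls12381.G2Jac
	if _, err := result.MultiExp(points, scalars, ecc.MultiExpConfig{NbTasks: 2}); err != nil {
		panic(err)
	}
	fmt.Println(result.Z.IsZero()) // false unless the sum lands on infinity
}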
+func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the msmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	//	--> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each msmCX
+	// step 1
+	// we compute, for each scalar, over c-bit wide windows, nbChunks digits
+	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use extended Jacobian formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks,
+	bestC := func(nbPoints int) uint64 {
+		// implemented msmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+ // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results + min := math.MaxFloat64 + for _, c := range implementedCs { + cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cost := float64(cc) / float64(c) + if cost < min { + min = cost + C = c + } + } + // empirical, needs to be tuned. + // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } } - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) -func (p *G1Jac) msmC6(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 6 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // we have nbSplits intermediate results that we must sum together. 
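The bestC heuristic above can be played with in isolation. This sketch (256 scalar bits assumed, i.e. fr.Limbs = 4) reimplements cost = bits/c * (nbPoints + 2^c) outside the library:

package main

import (
	"fmt"
	"math"
)

// bestC mirrors the closure above: pick the window size c minimising
// (bits/c) * (nbPoints + 2^c). Sketch only, not the library function.
func bestC(nbPoints int) uint64 {
	const bits = 256
	min := math.MaxFloat64
	var best uint64
	for c := uint64(1); c <= 23; c++ {
		cost := float64(bits) / float64(c) * float64(nbPoints+(1<<c))
		if cost < min {
			min = cost
			best = c
		}
	}
	return best
}

func main() {
	for _, n := range []int{1 << 10, 1 << 16, 1 << 20} {
		fmt.Printf("nbPoints=%d -> c=%d\n", n, bestC(n))
	}
}

The surrounding loop then doubles nbSplits (halving nbPoints) until nbChunks reaches config.NbTasks, so each split is itself a smaller MSM whose partial result is summed just below.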
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) + msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) } + close(chDone) + return p, nil +} - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) +func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + switch c { - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC7(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 7 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 4: + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 5: + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 
1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 8: + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 10: + msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} + case 11: + msmCG2Affine[bucketg2JacExtendedC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC8(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 12: + msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 13: + msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 14: + msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 15: + msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 16: + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 17: + msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, 
c, chChunks[:]) -} + case 18: + msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC9(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 9 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 19: + msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split 
:= len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, 
chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, 
chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], 
chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], 
scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExp(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the msmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each msmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. 
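// Editor's illustration (not part of the patch): plugging hypothetical numbers into
// the heuristic above, assuming a 256-bit scalar field (fr.Limbs*64 = 256) and
// nbPoints = 1<<20:
//   c = 12: (256/12) * (2^20 + 2^12) ≈ 22.4M group ops
//   c = 16: (256/16) * (2^20 + 2^16) ≈ 17.8M group ops
//   c = 21: (256/21) * (2^20 + 2^21) ≈ 38.4M group ops
// the heuristic therefore picks c = 16 here: once 2^c outgrows nbPoints, the
// per-window bucket cost dominates and larger windows stop paying off.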
- // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. - _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 4: - p.msmC4(points, scalars, splitFirstChunk) - - case 5: - p.msmC5(points, scalars, splitFirstChunk) - - case 6: - p.msmC6(points, scalars, splitFirstChunk) - - case 7: - p.msmC7(points, scalars, splitFirstChunk) - - case 8: - p.msmC8(points, scalars, splitFirstChunk) - - case 9: - p.msmC9(points, scalars, splitFirstChunk) - - case 10: - p.msmC10(points, scalars, splitFirstChunk) - - case 11: - p.msmC11(points, scalars, splitFirstChunk) - - case 12: - p.msmC12(points, scalars, splitFirstChunk) - - case 13: - p.msmC13(points, scalars, splitFirstChunk) - - case 14: - p.msmC14(points, scalars, splitFirstChunk) - - case 15: - p.msmC15(points, scalars, splitFirstChunk) - - case 16: - p.msmC16(points, scalars, splitFirstChunk) - - case 20: - p.msmC20(points, scalars, splitFirstChunk) - - case 21: - p.msmC21(points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -// msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { - var _p g2JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -func msmProcessChunkG2Affine(chunk uint64, - chRes chan<- g2JacExtended, - buckets []g2JacExtended, - c 
uint64, - points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) - } - } - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func (p *G2Jac) msmC4(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 4 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC5(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 5 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 
256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC6(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 6 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC7(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 7 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - 
chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC8(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC9(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 9 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - 
msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, 
scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + case 20: + msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + case 21: + msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 22: + msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 23: + msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + default: + panic("not implemented") } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) } -func (p *G2Jac) msmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through 
all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +// msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { + var _p g2JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) } - return msmReduceChunkG2Affine(p, c, chChunks[:]) + return p.unsafeFromJacExtended(&_p) } -func (p *G2Jac) msmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) +func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + scalars []fr.Element) { - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + var buckets B 
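// Editor's sketch (not part of the patch): what msmReduceChunkG2Affine above
// computes, restated in plain integer arithmetic. Each chunk result t_j is the
// weighted bucket sum for window j, and the final value is sum_j 2^(c*j) * t_j,
// evaluated Horner-style; the inner `_p.double(&_p)` loop is the 2^c scaling.
func reduceChunksScalar(chunks []uint64, c uint) uint64 {
	// chunks[len(chunks)-1] holds the most significant window
	acc := chunks[len(chunks)-1]
	for j := len(chunks) - 2; j >= 0; j-- {
		acc <<= c        // c doublings == multiply by 2^c
		acc += chunks[j] // add window j's contribution
	}
	return acc
}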
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
 	}
-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g2JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
 	}
-	return msmReduceChunkG2Affine(p, c, chChunks[:])
-}
-
-func (p *G2Jac) msmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac {
-	const (
-		c        = 15                  // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	)
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+		if bits == 0 {
+			continue
+		}
-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g2JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g2JacExtended, 1)
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].addMixed(&points[i])
+		} else {
+			// sub
+			buckets[bits & ^msbWindow].subMixed(&points[i])
+		}
 	}
-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	go func(j uint64, points []G2Affine, scalars []fr.Element) {
-		var buckets [1 << (lastC - 1)]g2JacExtended
-		msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars)
-	}(uint64(nbChunks), points, scalars)
-
-	processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) {
-		var buckets [1 << (c - 1)]g2JacExtended
-		msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars)
-	}
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ...
+ n*bucket[n-1] - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].ZZ.IsZero() { + runningSum.add(&buckets[k]) + } + total.add(&runningSum) } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + chRes <- total - return msmReduceChunkG2Affine(p, c, chChunks[:]) } -func (p *G2Jac) msmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), 
points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G2Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel last C restore. + msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -2201,5 +841,5 @@ func (p *G2Jac) msmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk }() } - return msmReduceChunkG2Affine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index 7970a61d7e..d9469a7fb1 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -93,7 +93,7 @@ func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, con // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ 
-166,102 +166,111 @@ func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.E switch c { + case 1: + msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) case 6: - p.msmC6(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) case 7: - p.msmC7(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) case 9: - p.msmC9(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) case 10: - p.batchAffineMsmC10(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) case 11: - p.batchAffineMsmC11(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) case 12: - p.batchAffineMsmC12(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) case 13: - p.batchAffineMsmC13(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) case 14: - p.batchAffineMsmC14(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) case 15: - p.batchAffineMsmC15(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) case 16: - p.batchAffineMsmC16(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) + + case 18: + batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + + case 19: + batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) case 20: - p.batchAffineMsmC20(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) case 21: - p.batchAffineMsmC21(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) + + case 22: + batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) + + case 23: + batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") } } -// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of 
the buckets into the result of the multiExp -func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG1Affine struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G1Affine +type BatchG1Affine[B ibG1Affine] struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G1Affine + buckets *B } -func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { - batchSize := len(buckets) / 5 +func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { + batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine{ + return BatchG1Affine[B]{ buckets: buckets, points: points, batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } } -func (b *BatchG1Affine) IsFull() bool { +func (b *BatchG1Affine[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *BatchG1Affine) ExecuteAndReset() { +func (b *BatchG1Affine[B]) ExecuteAndReset() { if b.cptP == 0 { return } @@ -276,45 +285,45 @@ func (b *BatchG1Affine) ExecuteAndReset() { b.cptP = 0 } -func (b *BatchG1Affine) CanAdd(bID uint32) bool { +func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *BatchG1Affine) Add(op batchOp) { +func (b *BatchG1Affine[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch - B := &b.buckets[op.bucketID] + BK := &(*b.buckets)[op.bucketID] P := &b.points[op.pointID>>1] if P.IsInfinity() { return } // handle special cases with inf or -P / P - if B.IsInfinity() { + if BK.IsInfinity() { if op.isNeg() { - B.Neg(P) + BK.Neg(P) } else { - B.Set(P) + BK.Set(P) } return } if op.isNeg() { // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() + if BK.Equal(P) { + BK.setInfinity() return } } else { // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() return } } // b.bucketIds[b.cptP] = op.bucketID b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B + b.R[b.cptP] = BK if op.isNeg() { b.P[b.cptP].Neg(P) } else { @@ -323,7 +332,7 @@ func (b *BatchG1Affine) Add(op batchOp) { b.cptP++ } -func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { +func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -338,16 +347,15 @@ func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { } -func msmProcessChunkG1AffineBatchAffine(chunk uint64, +func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, - buckets []G1Affine, c uint64, points []G1Affine, scalars []fr.Element) { mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) - + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -364,7 +372,7 @@ 
func msmProcessChunkG1AffineBatchAffine(chunk uint64, s.shiftHigh = (c - nbBitsHigh) } - batch := newBatchG1Affine(buckets, points) + batch := newBatchG1Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 for i := 0; i < len(scalars); i++ { @@ -433,11 +441,12 @@ func msmProcessChunkG1AffineBatchAffine(chunk uint64, } -func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -445,29 +454,25 @@ func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended + chChunks := make([]chan g1JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g1JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g1JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
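// Editor's note (illustrative, not part of the patch): the window arithmetic the
// TODO above refers to, assuming a 256-bit scalar (fr.Limbs * 64 = 256):
//   nbChunks = ceil(256 / c)
//   lastC    = 256 - c*(256/c)   // leftover high bits, e.g. c = 10 => lastC = 6
// with c = 10 the last chunk spans only 6 bits, so 1<<(6-1) = 32 buckets would
// suffice instead of the full 1<<(10-1) = 512; recovering that saving is what
// "lastC restore" means.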
+ msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -490,1343 +495,521 @@ func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, split }() } - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } -func (p *G1Jac) batchAffineMsmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +type bucketG1AffineC1 [1 << (1 - 1)]G1Affine +type bucketG1AffineC2 [1 << (2 - 1)]G1Affine +type bucketG1AffineC3 [1 << (3 - 1)]G1Affine +type bucketG1AffineC4 [1 << (4 - 1)]G1Affine +type bucketG1AffineC5 [1 << (5 - 1)]G1Affine +type bucketG1AffineC6 [1 << (6 - 1)]G1Affine +type bucketG1AffineC7 [1 << (7 - 1)]G1Affine +type bucketG1AffineC8 [1 << (8 - 1)]G1Affine +type bucketG1AffineC9 [1 << (9 - 1)]G1Affine +type bucketG1AffineC10 [1 << (10 - 1)]G1Affine +type bucketG1AffineC11 [1 << (11 - 1)]G1Affine +type bucketG1AffineC12 [1 << (12 - 1)]G1Affine +type bucketG1AffineC13 [1 << (13 - 1)]G1Affine +type bucketG1AffineC14 [1 << (14 - 1)]G1Affine +type bucketG1AffineC15 [1 << (15 - 1)]G1Affine +type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC17 [1 << (17 - 1)]G1Affine +type bucketG1AffineC18 [1 << (18 - 1)]G1Affine +type bucketG1AffineC19 [1 << (19 - 1)]G1Affine +type bucketG1AffineC20 [1 << (20 - 1)]G1Affine +type bucketG1AffineC21 [1 << (21 - 1)]G1Affine +type bucketG1AffineC22 [1 << (22 - 1)]G1Affine +type bucketG1AffineC23 [1 << (23 - 1)]G1Affine +type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended +type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended +type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended +type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended +type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended +type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended +type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended +type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended +type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended +type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended +type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended +type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended +type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended +type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended +type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended +type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended +type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended +type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended +type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended +type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended + +type ibG1Affine interface { + bucketG1AffineC1 | + bucketG1AffineC2 | + bucketG1AffineC3 | + bucketG1AffineC4 | + bucketG1AffineC5 | + bucketG1AffineC6 | + bucketG1AffineC7 | + bucketG1AffineC8 | + 
bucketG1AffineC9 |
+	bucketG1AffineC10 |
+	bucketG1AffineC11 |
+	bucketG1AffineC12 |
+	bucketG1AffineC13 |
+	bucketG1AffineC14 |
+	bucketG1AffineC15 |
+	bucketG1AffineC16 |
+	bucketG1AffineC17 |
+	bucketG1AffineC18 |
+	bucketG1AffineC19 |
+	bucketG1AffineC20 |
+	bucketG1AffineC21 |
+	bucketG1AffineC22 |
+	bucketG1AffineC23
+}
 
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+type ibg1JacExtended interface {
+	bucketg1JacExtendedC1 |
+	bucketg1JacExtendedC2 |
+	bucketg1JacExtendedC3 |
+	bucketg1JacExtendedC4 |
+	bucketg1JacExtendedC5 |
+	bucketg1JacExtendedC6 |
+	bucketg1JacExtendedC7 |
+	bucketg1JacExtendedC8 |
+	bucketg1JacExtendedC9 |
+	bucketg1JacExtendedC10 |
+	bucketg1JacExtendedC11 |
+	bucketg1JacExtendedC12 |
+	bucketg1JacExtendedC13 |
+	bucketg1JacExtendedC14 |
+	bucketg1JacExtendedC15 |
+	bucketg1JacExtendedC16 |
+	bucketg1JacExtendedC17 |
+	bucketg1JacExtendedC18 |
+	bucketg1JacExtendedC19 |
+	bucketg1JacExtendedC20 |
+	bucketg1JacExtendedC21 |
+	bucketg1JacExtendedC22 |
+	bucketg1JacExtendedC23
+}
 
-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g1JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g1JacExtended, 1)
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
+func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
 	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
 
-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	// TODO @gbotrel replace this in code generator
-	if lastC >= 10 {
-		go func(j uint64, points []G1Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]G1Affine
-			msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
-	} else {
-		go func(j uint64, points []G1Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]g1JacExtended
-			msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
-	}
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
+func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	//	--> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
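+	//	    (illustrative sizes: a BN254 G1Affine is two 32-byte fp.Element coordinates,
+	//	    i.e. exactly one 64-byte cache line, while a G2Affine is two fptower.E2
+	//	    coordinates, i.e. 128 bytes spanning two cache lines)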
-	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		var buckets [1 << (c - 1)]G1Affine
-		msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunk digits
+	// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
 	}
 
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
 	}
 
-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g1JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks.
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
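+		// worked example of this cost model (illustrative, fr.Limbs = 4 so bits = 256),
+		// for nbPoints = 1<<20:
+		//   c = 12: 256/12 * (2^20 + 2^12) ~ 22.5M group ops
+		//   c = 16: 256/16 * (2^20 + 2^16) ~ 17.8M group ops
+		//   c = 20: 256/20 * (2^20 + 2^20) ~ 26.8M group ops
+		// under this exact formula the minimum is reached at c = 17.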
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C } - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } -func (p *G1Jac) batchAffineMsmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) + // we have nbSplits intermediate results that we must sum together. 
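+	// illustrative walk-through of the fan-out below: with nbSplits = 4, goroutines
+	// i = 0..2 each reduce points[i*nbPoints:(i+1)*nbPoints] into _p[i], the current
+	// goroutine reduces the tail slice into p, and the chDone loop then folds each
+	// finished _p[i] into p with AddAssign.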
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) } + close(chDone) + return p, nil +} - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } +func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + switch c { - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) -func (p *G1Jac) batchAffineMsmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 4: + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 5: + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code 
generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 8: + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} + case 10: + batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) -func (p *G1Jac) batchAffineMsmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 11: + batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 12: + batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 13: + batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } + case 14: + 
batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 15: + batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 16: + batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go 
routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // 
number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. 
- - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 4: - p.msmC4(points, scalars, splitFirstChunk) - - case 5: - p.msmC5(points, scalars, splitFirstChunk) - - case 6: - p.msmC6(points, scalars, splitFirstChunk) - - case 7: - p.msmC7(points, scalars, splitFirstChunk) - - case 8: - p.msmC8(points, scalars, splitFirstChunk) - - case 9: - p.msmC9(points, scalars, splitFirstChunk) - - case 10: - p.batchAffineMsmC10(points, scalars, splitFirstChunk) - - case 11: - p.batchAffineMsmC11(points, scalars, splitFirstChunk) - - case 12: - p.batchAffineMsmC12(points, scalars, splitFirstChunk) - - case 13: - p.batchAffineMsmC13(points, scalars, splitFirstChunk) - - case 14: - p.batchAffineMsmC14(points, scalars, splitFirstChunk) - - case 15: - p.batchAffineMsmC15(points, scalars, splitFirstChunk) - - case 16: - p.batchAffineMsmC16(points, scalars, splitFirstChunk) - - case 20: - p.batchAffineMsmC20(points, scalars, splitFirstChunk) - - case 21: - p.batchAffineMsmC21(points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { - var _p g2JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG2Affine struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G2Affine -} - -func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { - batchSize := len(buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG2Affine{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), - } -} - -func (b *BatchG2Affine) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG2Affine) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG2Affine) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG2Affine) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - B := &b.buckets[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if B.IsInfinity() { - if op.isNeg() { - B.Neg(P) - } else { 
- B.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} - -func msmProcessChunkG2AffineBatchAffine(chunk uint64, - chRes chan<- g2JacExtended, - buckets []G2Affine, - c uint64, - points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - batch := newBatchG2Affine(buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - op.bucketID = uint32(bits - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) - } else { - // sub - op.bucketID = (uint32(bits & ^msbWindow)) - op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) - } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() - nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - } - } else { - // put it in queue. - queue = append(queue, op) - } - } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() - for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. - } - - // flush items in batch. - batch.ExecuteAndReset() - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] - - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { - runningSum.addMixed(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func (p *G2Jac) batchAffineMsmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else 
{ - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that 
buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 17: + batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 18: + batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 19: + batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + case 20: + batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - 
}(uint64(nbChunks), points, scalars) - } + case 21: + batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 22: + batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 23: + batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + default: + panic("not implemented") } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) } -func (p *G2Jac) batchAffineMsmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance +type BatchG2Affine[B ibG2Affine] struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G2Affine + buckets *B +} - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) +func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { + batchSize := len(*buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + if batchSize <= 0 { + batchSize = 1 } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + return BatchG2Affine[B]{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } +} - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } +func (b *BatchG2Affine[B]) IsFull() bool { + return b.cptP == b.batchSize +} - if !splitFirstChunk { - 
go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func (b *BatchG2Affine[B]) ExecuteAndReset() { + if b.cptP == 0 { + return } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 } -func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance +func (b *BatchG2Affine[B]) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) + BK := &(*b.buckets)[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(P) + } else { + BK.Set(P) + } + return } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(P) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() + return + } } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = BK + if op.isNeg() { + b.P[b.cptP].Neg(P) } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + b.P[b.cptP].Set(P) } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + b.cptP++ } -func (p *G2Jac) batchAffineMsmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + 
queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance +} - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } +func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + scalars []fr.Element) { - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() } - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) } - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + batch := newBatchG2Affine(&buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. 
+ queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG2Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) } - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + chRes <- total + } -func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -1834,29 +1017,25 @@ func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
+ msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -1879,5 +1058,104 @@ func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, split }() } - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) +} + +type bucketG2AffineC1 [1 << (1 - 1)]G2Affine +type bucketG2AffineC2 [1 << (2 - 1)]G2Affine +type bucketG2AffineC3 [1 << (3 - 1)]G2Affine +type bucketG2AffineC4 [1 << (4 - 1)]G2Affine +type bucketG2AffineC5 [1 << (5 - 1)]G2Affine +type bucketG2AffineC6 [1 << (6 - 1)]G2Affine +type bucketG2AffineC7 [1 << (7 - 1)]G2Affine +type bucketG2AffineC8 [1 << (8 - 1)]G2Affine +type bucketG2AffineC9 [1 << (9 - 1)]G2Affine +type bucketG2AffineC10 [1 << (10 - 1)]G2Affine +type bucketG2AffineC11 [1 << (11 - 1)]G2Affine +type bucketG2AffineC12 [1 << (12 - 1)]G2Affine +type bucketG2AffineC13 [1 << (13 - 1)]G2Affine +type bucketG2AffineC14 [1 << (14 - 1)]G2Affine +type bucketG2AffineC15 [1 << (15 - 1)]G2Affine +type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC17 [1 << (17 - 1)]G2Affine +type bucketG2AffineC18 [1 << (18 - 1)]G2Affine +type bucketG2AffineC19 [1 << (19 - 1)]G2Affine +type bucketG2AffineC20 [1 << (20 - 1)]G2Affine +type bucketG2AffineC21 [1 << (21 - 1)]G2Affine +type bucketG2AffineC22 [1 << (22 - 1)]G2Affine +type bucketG2AffineC23 [1 << (23 - 1)]G2Affine +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended +type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended +type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended +type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended +type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended +type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended +type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended +type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended +type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended +type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended +type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended +type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended +type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended +type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended +type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended +type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended +type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended +type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended +type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended + +type ibG2Affine interface { + bucketG2AffineC1 | + bucketG2AffineC2 | + bucketG2AffineC3 | + bucketG2AffineC4 | + bucketG2AffineC5 | + bucketG2AffineC6 | + bucketG2AffineC7 | + bucketG2AffineC8 | + bucketG2AffineC9 | + bucketG2AffineC10 | + bucketG2AffineC11 | + bucketG2AffineC12 | + bucketG2AffineC13 | + bucketG2AffineC14 | + bucketG2AffineC15 | + bucketG2AffineC16 | + bucketG2AffineC17 | + bucketG2AffineC18 | + bucketG2AffineC19 | + 
bucketG2AffineC20 | + bucketG2AffineC21 | + bucketG2AffineC22 | + bucketG2AffineC23 +} + +type ibg2JacExtended interface { + bucketg2JacExtendedC1 | + bucketg2JacExtendedC2 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC4 | + bucketg2JacExtendedC5 | + bucketg2JacExtendedC6 | + bucketg2JacExtendedC7 | + bucketg2JacExtendedC8 | + bucketg2JacExtendedC9 | + bucketg2JacExtendedC10 | + bucketg2JacExtendedC11 | + bucketg2JacExtendedC12 | + bucketg2JacExtendedC13 | + bucketg2JacExtendedC14 | + bucketg2JacExtendedC15 | + bucketg2JacExtendedC16 | + bucketg2JacExtendedC17 | + bucketg2JacExtendedC18 | + bucketg2JacExtendedC19 | + bucketg2JacExtendedC20 | + bucketg2JacExtendedC21 | + bucketg2JacExtendedC22 | + bucketg2JacExtendedC23 } diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index 05f44f6112..4248afb29d 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 0c3d0039b2..f97aa4e2f4 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -221,7 +221,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -294,50 +294,74 @@ func msmInnerG1Jac(p *G1Jac, c int, 
points []G1Affine, scalars []fr.Element, spl switch c { + case 1: + msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) case 6: - p.msmC6(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) case 7: - p.msmC7(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) case 9: - p.msmC9(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) case 10: - p.msmC10(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) case 11: - p.msmC11(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) case 12: - p.msmC12(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) case 13: - p.msmC13(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) case 14: - p.msmC14(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) case 15: - p.msmC15(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) case 16: - p.msmC16(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) + + case 18: + msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + + case 19: + msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) case 20: - p.msmC20(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) case 21: - p.msmC21(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) + + case 22: + msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) + + case 23: + msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") @@ -360,9 +384,8 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J return p.unsafeFromJacExtended(&_p) } -func msmProcessChunkG1Affine(chunk uint64, +func 
msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64,
 	chRes chan<- g1JacExtended,
-	buckets []g1JacExtended,
 	c uint64,
 	points []G1Affine,
 	scalars []fr.Element) {
@@ -370,6 +393,7 @@ func msmProcessChunkG1Affine(chunk uint64,
 	mask := uint64((1 << c) - 1) // low c bits are 1
 	msbWindow := uint64(1 << (c - 1))
 
+	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
@@ -424,26 +448,36 @@ func msmProcessChunkG1Affine(chunk uint64,
 
 }
 
-func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
-	const (
-		c        = 4                          // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c)        // number of c-bit radixes in a scalar
-	)
-
+func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
+	nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
 	// for each chunk, spawn one go routine that'll loop through all the scalars in the
 	// corresponding bit-window
 	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
 	// critical for performance
 
 	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks]chan g1JacExtended
+	chChunks := make([]chan g1JacExtended, nbChunks)
 	for i := 0; i < len(chChunks); i++ {
 		chChunks[i] = make(chan g1JacExtended, 1)
 	}
 
+	if (fr.Limbs*64)%c != 0 {
+		// TODO @gbotrel not always needed to do ext jac here.
+		go func(j uint64, points []G1Affine, scalars []fr.Element) {
+			// var buckets LB
+			// lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
+			// buckets := make([]g1JacExtended, 1<<(lastC-1))
+			// TODO @gbotrel lastC restore.
+			msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars)
+		}(uint64(nbChunks-1), points, scalars)
+		nbChunks--
+	}
+
 	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		var buckets [1 << (c - 1)]g1JacExtended
-		msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars)
+		msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars)
 	}
 
 	for j := int(nbChunks - 1); j > 0; j-- {
@@ -466,1719 +500,325 @@ func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, splitFirstChunk b
 		}()
 	}
 
-	return msmReduceChunkG1Affine(p, c, chChunks[:])
+	return msmReduceChunkG1Affine(p, int(c), chChunks[:])
 }
 
-func (p *G1Jac) msmC5(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
-	const (
-		c        = 5                          // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c)        // number of c-bit radixes in a scalar
-	)
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
+func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExp(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
 
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
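The per-chunk results returned through chChunks are recombined by msmReduceChunkG1Affine exactly like digits of a base-2^c number, doubling c times between digits. A toy integer sketch of that recombination, ignoring the signed-digit encoding the real code uses; reduceChunks is a hypothetical name:

```go
package main

import "fmt"

func reduceChunks(chunks []uint64, c uint) uint64 {
	total := chunks[len(chunks)-1]
	for j := len(chunks) - 2; j >= 0; j-- {
		for l := uint(0); l < c; l++ {
			total += total // point doubling in the real code
		}
		total += chunks[j] // adding the chunk's bucket sum
	}
	return total
}

func main() {
	const c = 4
	x := uint64(0xBEEF)
	var chunks []uint64 // c-bit digits of x, least significant first
	for v := x; v != 0; v >>= c {
		chunks = append(chunks, v&((1<<c)-1))
	}
	fmt.Printf("%#x\n", reduceChunks(chunks, c)) // prints 0xbeef
}
```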
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
+func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the msmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
 
-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g1JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g1JacExtended, 1)
-	}
+	// for each msmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunks digits
+	// if a digit is larger than 2^{c-1}, then we borrow 2^{c} from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
 
-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	go func(j uint64, points []G1Affine, scalars []fr.Element) {
-		var buckets [1 << (lastC - 1)]g1JacExtended
-		msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars)
-	}(uint64(nbChunks), points, scalars)
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
 
-	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		var buckets [1 << (c - 1)]g1JacExtended
-		msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
 	}
 
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks
+	bestC := func(nbPoints int) uint64 {
+		// implemented msmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+ // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results + min := math.MaxFloat64 + for _, c := range implementedCs { + cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cost := float64(cc) / float64(c) + if cost < min { + min = cost + C = c + } + } + // empirical, needs to be tuned. + // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } } - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) -func (p *G1Jac) msmC6(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 6 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // we have nbSplits intermediate results that we must sum together. 
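Before the nbSplits intermediate results are produced and summed below, the loop above settles on C and the split count together. For intuition, that selection can be run standalone; a sketch hard-coding fr.Limbs = 4 (256-bit scalars) and the same cost model:

```go
package main

import (
	"fmt"
	"math"
)

// bestC mirrors the cost model above: cost = bits/c * (nbPoints + 2^c),
// minimized over the implemented window sizes 1..23.
func bestC(nbPoints int) uint64 {
	var C uint64
	min := math.MaxFloat64
	for c := uint64(1); c <= 23; c++ {
		cost := float64(4*64*(nbPoints+(1<<c))) / float64(c)
		if cost < min {
			min, C = cost, c
		}
	}
	return C
}

func main() {
	nbPoints, nbTasks := 1<<20, 16
	nbSplits, nbChunks := 1, 0
	var C uint64
	for nbChunks < nbTasks {
		C = bestC(nbPoints)
		nbChunks = int(4 * 64 / C) // chunks per scalar
		if (4*64)%C != 0 {
			nbChunks++ // smaller last window
		}
		nbChunks *= nbSplits
		if nbChunks < nbTasks {
			nbSplits <<= 1 // halve the point set and retry
			nbPoints >>= 1
		}
	}
	fmt.Println("C =", C, "nbSplits =", nbSplits, "nbChunks =", nbChunks)
}
```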
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) + msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) } + close(chDone) + return p, nil +} - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) +func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + switch c { - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC7(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 7 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 4: + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 5: + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 
1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 8: + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 10: + msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} + case 11: + msmCG2Affine[bucketg2JacExtendedC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC8(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 12: + msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 13: + msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 14: + msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 15: + msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 16: + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 17: + msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, 
c, chChunks[:]) -} + case 18: + msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC9(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 9 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 19: + msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split 
:= len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, 
chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, 
chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], 
chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], 
scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExp(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the msmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each msmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. 
- // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. - _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 4: - p.msmC4(points, scalars, splitFirstChunk) - - case 5: - p.msmC5(points, scalars, splitFirstChunk) - - case 6: - p.msmC6(points, scalars, splitFirstChunk) - - case 7: - p.msmC7(points, scalars, splitFirstChunk) - - case 8: - p.msmC8(points, scalars, splitFirstChunk) - - case 9: - p.msmC9(points, scalars, splitFirstChunk) - - case 10: - p.msmC10(points, scalars, splitFirstChunk) - - case 11: - p.msmC11(points, scalars, splitFirstChunk) - - case 12: - p.msmC12(points, scalars, splitFirstChunk) - - case 13: - p.msmC13(points, scalars, splitFirstChunk) - - case 14: - p.msmC14(points, scalars, splitFirstChunk) - - case 15: - p.msmC15(points, scalars, splitFirstChunk) - - case 16: - p.msmC16(points, scalars, splitFirstChunk) - - case 20: - p.msmC20(points, scalars, splitFirstChunk) - - case 21: - p.msmC21(points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -// msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { - var _p g2JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -func msmProcessChunkG2Affine(chunk uint64, - chRes chan<- g2JacExtended, - buckets []g2JacExtended, - c 
uint64, - points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) - } - } - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func (p *G2Jac) msmC4(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 4 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC5(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 5 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 
256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC6(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 6 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC7(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 7 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - 
chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC8(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC9(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 9 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - 
msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, 
scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + case 20: + msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + case 21: + msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 22: + msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 23: + msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + default: + panic("not implemented") } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) } -func (p *G2Jac) msmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through 
all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +// msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { + var _p g2JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) } - return msmReduceChunkG2Affine(p, c, chChunks[:]) + return p.unsafeFromJacExtended(&_p) } -func (p *G2Jac) msmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) +func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + scalars []fr.Element) { - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + var buckets B 
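+	// editor's note: buckets is one of the generated bucketg2JacExtendedCn array types;
+	// using a fixed-size array type parameter (rather than a slice) is what lets the
+	// compiler keep the buckets on the stack, which is critical for performance here.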
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
	}

-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g2JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
	}

-	return msmReduceChunkG2Affine(p, c, chChunks[:])
-}
-
-func (p *G2Jac) msmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac {
-	const (
-		c = 15 // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	)
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}

-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+		if bits == 0 {
+			continue
+		}

-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g2JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g2JacExtended, 1)
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].addMixed(&points[i])
+		} else {
+			// sub
+			buckets[bits & ^msbWindow].subMixed(&points[i])
+		}
	}

-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	go func(j uint64, points []G2Affine, scalars []fr.Element) {
-		var buckets [1 << (lastC - 1)]g2JacExtended
-		msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars)
-	}(uint64(nbChunks), points, scalars)
-
-	processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) {
-		var buckets [1 << (c - 1)]g2JacExtended
-		msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars)
-	}
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ...
+ n*bucket[n-1] - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].ZZ.IsZero() { + runningSum.add(&buckets[k]) + } + total.add(&runningSum) } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + chRes <- total - return msmReduceChunkG2Affine(p, c, chChunks[:]) } -func (p *G2Jac) msmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), 
points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G2Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel last C restore. + msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -2201,5 +841,5 @@ func (p *G2Jac) msmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk }() } - return msmReduceChunkG2Affine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index e08952a333..c2d52847f3 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -93,7 +93,7 @@ func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, con // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ 
-166,102 +166,111 @@ func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.E switch c { + case 1: + msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) case 6: - p.msmC6(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) case 7: - p.msmC7(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) case 9: - p.msmC9(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) case 10: - p.batchAffineMsmC10(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) case 11: - p.batchAffineMsmC11(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) case 12: - p.batchAffineMsmC12(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) case 13: - p.batchAffineMsmC13(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) case 14: - p.batchAffineMsmC14(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) case 15: - p.batchAffineMsmC15(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) case 16: - p.batchAffineMsmC16(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) + + case 18: + batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + + case 19: + batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) case 20: - p.batchAffineMsmC20(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) case 21: - p.batchAffineMsmC21(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) + + case 22: + batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) + + case 23: + batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") } } -// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of 
the buckets into the result of the multiExp -func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG1Affine struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G1Affine +type BatchG1Affine[B ibG1Affine] struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G1Affine + buckets *B } -func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { - batchSize := len(buckets) / 5 +func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { + batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine{ + return BatchG1Affine[B]{ buckets: buckets, points: points, batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } } -func (b *BatchG1Affine) IsFull() bool { +func (b *BatchG1Affine[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *BatchG1Affine) ExecuteAndReset() { +func (b *BatchG1Affine[B]) ExecuteAndReset() { if b.cptP == 0 { return } @@ -276,45 +285,45 @@ func (b *BatchG1Affine) ExecuteAndReset() { b.cptP = 0 } -func (b *BatchG1Affine) CanAdd(bID uint32) bool { +func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *BatchG1Affine) Add(op batchOp) { +func (b *BatchG1Affine[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch - B := &b.buckets[op.bucketID] + BK := &(*b.buckets)[op.bucketID] P := &b.points[op.pointID>>1] if P.IsInfinity() { return } // handle special cases with inf or -P / P - if B.IsInfinity() { + if BK.IsInfinity() { if op.isNeg() { - B.Neg(P) + BK.Neg(P) } else { - B.Set(P) + BK.Set(P) } return } if op.isNeg() { // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() + if BK.Equal(P) { + BK.setInfinity() return } } else { // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() return } } // b.bucketIds[b.cptP] = op.bucketID b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B + b.R[b.cptP] = BK if op.isNeg() { b.P[b.cptP].Neg(P) } else { @@ -323,7 +332,7 @@ func (b *BatchG1Affine) Add(op batchOp) { b.cptP++ } -func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { +func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -338,16 +347,15 @@ func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { } -func msmProcessChunkG1AffineBatchAffine(chunk uint64, +func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, - buckets []G1Affine, c uint64, points []G1Affine, scalars []fr.Element) { mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) - + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -364,7 +372,7 @@ 
func msmProcessChunkG1AffineBatchAffine(chunk uint64, s.shiftHigh = (c - nbBitsHigh) } - batch := newBatchG1Affine(buckets, points) + batch := newBatchG1Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 for i := 0; i < len(scalars); i++ { @@ -433,11 +441,12 @@ func msmProcessChunkG1AffineBatchAffine(chunk uint64, } -func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -445,29 +454,25 @@ func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended + chChunks := make([]chan g1JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g1JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g1JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
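+			// editor's note: for now the (smaller) last window always takes the
+			// extended-Jacobian path; the branches deleted in this hunk used the
+			// affine batch path when lastC >= 10, which the TODO above tracks restoring.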
+ msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -490,1343 +495,521 @@ func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, split }() } - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } -func (p *G1Jac) batchAffineMsmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +type bucketG1AffineC1 [1 << (1 - 1)]G1Affine +type bucketG1AffineC2 [1 << (2 - 1)]G1Affine +type bucketG1AffineC3 [1 << (3 - 1)]G1Affine +type bucketG1AffineC4 [1 << (4 - 1)]G1Affine +type bucketG1AffineC5 [1 << (5 - 1)]G1Affine +type bucketG1AffineC6 [1 << (6 - 1)]G1Affine +type bucketG1AffineC7 [1 << (7 - 1)]G1Affine +type bucketG1AffineC8 [1 << (8 - 1)]G1Affine +type bucketG1AffineC9 [1 << (9 - 1)]G1Affine +type bucketG1AffineC10 [1 << (10 - 1)]G1Affine +type bucketG1AffineC11 [1 << (11 - 1)]G1Affine +type bucketG1AffineC12 [1 << (12 - 1)]G1Affine +type bucketG1AffineC13 [1 << (13 - 1)]G1Affine +type bucketG1AffineC14 [1 << (14 - 1)]G1Affine +type bucketG1AffineC15 [1 << (15 - 1)]G1Affine +type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC17 [1 << (17 - 1)]G1Affine +type bucketG1AffineC18 [1 << (18 - 1)]G1Affine +type bucketG1AffineC19 [1 << (19 - 1)]G1Affine +type bucketG1AffineC20 [1 << (20 - 1)]G1Affine +type bucketG1AffineC21 [1 << (21 - 1)]G1Affine +type bucketG1AffineC22 [1 << (22 - 1)]G1Affine +type bucketG1AffineC23 [1 << (23 - 1)]G1Affine +type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended +type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended +type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended +type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended +type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended +type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended +type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended +type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended +type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended +type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended +type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended +type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended +type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended +type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended +type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended +type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended +type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended +type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended +type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended +type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended + +type ibG1Affine interface { + bucketG1AffineC1 | + bucketG1AffineC2 | + bucketG1AffineC3 | + bucketG1AffineC4 | + bucketG1AffineC5 | + bucketG1AffineC6 | + bucketG1AffineC7 | + bucketG1AffineC8 | + 
bucketG1AffineC9 |
+		bucketG1AffineC10 |
+		bucketG1AffineC11 |
+		bucketG1AffineC12 |
+		bucketG1AffineC13 |
+		bucketG1AffineC14 |
+		bucketG1AffineC15 |
+		bucketG1AffineC16 |
+		bucketG1AffineC17 |
+		bucketG1AffineC18 |
+		bucketG1AffineC19 |
+		bucketG1AffineC20 |
+		bucketG1AffineC21 |
+		bucketG1AffineC22 |
+		bucketG1AffineC23
+}

-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+type ibg1JacExtended interface {
+	bucketg1JacExtendedC1 |
+		bucketg1JacExtendedC2 |
+		bucketg1JacExtendedC3 |
+		bucketg1JacExtendedC4 |
+		bucketg1JacExtendedC5 |
+		bucketg1JacExtendedC6 |
+		bucketg1JacExtendedC7 |
+		bucketg1JacExtendedC8 |
+		bucketg1JacExtendedC9 |
+		bucketg1JacExtendedC10 |
+		bucketg1JacExtendedC11 |
+		bucketg1JacExtendedC12 |
+		bucketg1JacExtendedC13 |
+		bucketg1JacExtendedC14 |
+		bucketg1JacExtendedC15 |
+		bucketg1JacExtendedC16 |
+		bucketg1JacExtendedC17 |
+		bucketg1JacExtendedC18 |
+		bucketg1JacExtendedC19 |
+		bucketg1JacExtendedC20 |
+		bucketg1JacExtendedC21 |
+		bucketg1JacExtendedC22 |
+		bucketg1JacExtendedC23
+}

-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g1JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g1JacExtended, 1)
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
+func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
	}
+	p.FromJacobian(&_p)
+	return p, nil
+}

-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	// TODO @gbotrel replace this in code generator
-	if lastC >= 10 {
-		go func(j uint64, points []G1Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]G1Affine
-			msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
-	} else {
-		go func(j uint64, points []G1Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]g1JacExtended
-			msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
-	}
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
+func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
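+	// editor's note (illustrative arithmetic, assuming 256-bit scalars): with the
+	// cost model used by bestC below, cost = bits/c * (nbPoints + 2^{c}), taking
+	// nbPoints = 2^20: c = 10 gives 256/10 * (2^20 + 2^10) ≈ 26.9M group operations,
+	// while c = 16 gives 256/16 * (2^20 + 2^16) ≈ 17.8M; larger windows win as long
+	// as the bucket count 2^{c} stays small relative to nbPoints.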
-	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		var buckets [1 << (c - 1)]G1Affine
-		msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunk digits
+	// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
	}

-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
	}

-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g1JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks,
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
+		// if C > 16 && nbPoints < 1 << 23 {
+		// 	C = 16
+		// }
+		return C
+	}

-	return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:])
-}
+	var C uint64
+	nbSplits := 1
+	nbChunks := 0
+	for nbChunks < config.NbTasks {
+		C = bestC(nbPoints)
+		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+		if (fr.Limbs*64)%C != 0 {
+			nbChunks++
+		}
+		nbChunks *= nbSplits
+		if nbChunks < config.NbTasks {
+			nbSplits <<= 1
+			nbPoints >>= 1
+		}
+	}

-func (p *G1Jac) batchAffineMsmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
-	const (
-		c = 12 // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	)
+	// partition the scalars
+	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
+	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
+	var smallValues int
+	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)

-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+	// if we have more than 10% of small values, we split the processing of the first chunk in 2
+	// we may want to do that in msmInnerG2JacBatchAffine, but that would incur a cost of looping through all scalars one more time
+	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1

-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g1JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g1JacExtended, 1)
+	// we have nbSplits intermediate results that we must sum together.
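+	// editor's note: the first nbSplits-1 slices of points/scalars are each handled
+	// by their own goroutine below; p accumulates the last slice, then the partial
+	// results in _p are summed in as their indices arrive on chDone.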
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) } + close(chDone) + return p, nil +} - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } +func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + switch c { - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) -func (p *G1Jac) batchAffineMsmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 4: + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 5: + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code 
generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 8: + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} + case 10: + batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) -func (p *G1Jac) batchAffineMsmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 11: + batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 12: + batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 13: + batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } + case 14: + 
batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 15: + batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 16: + batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go 
routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // 
number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. 
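Every msmCX/batchAffineMsmCX variant removed in this patch repeats the same scaffolding, which the generic functions below fold into one place: spawn one goroutine per c-bit chunk, collect the partial sums over buffered channels, then combine them MSB-first with c doublings between chunks (Horner's rule in base 2^c). A minimal sketch of that shape, assuming a hypothetical `partial` type in place of g1JacExtended/g2JacExtended and a caller-supplied `process` in place of msmProcessChunk:

// Sketch only; `partial` is a hypothetical stand-in for the extended
// Jacobian point types used by the real code.
package msmsketch

type partial struct{ v uint64 }

func (p *partial) double()        { p.v <<= 1 }
func (p *partial) add(q *partial) { p.v += q.v }

// reduceChunks mirrors the fan-out/fan-in shape of the msmCX functions:
// one goroutine per chunk, then an MSB-first Horner reduction.
func reduceChunks(c, nbChunks int, process func(chunk int) partial) partial {
	chChunks := make([]chan partial, nbChunks)
	for i := range chChunks {
		chChunks[i] = make(chan partial, 1) // buffered: workers never block on send
		go func(j int) { chChunks[j] <- process(j) }(i)
	}
	total := <-chChunks[nbChunks-1]
	for j := nbChunks - 2; j >= 0; j-- {
		for l := 0; l < c; l++ {
			total.double() // shift accumulated chunks up by one c-bit window
		}
		s := <-chChunks[j]
		total.add(&s)
	}
	return total
}

The buffered channels let every chunk worker finish independently, while the reduction consumes results in most-significant-first order.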
- - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
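The "step 1" comment above (deleted here, re-added for the plain MultiExp further down) is the heart of the bucket-count halving. A hedged sketch of that recoding on a toy single-word scalar, independent of the real fr.Element/partitionScalars layout:

// Sketch only: signed c-bit recoding on a single-word scalar.
package recoding

// signedDigits splits a scalar into base-2^c digits in (-2^{c-1}, 2^{c-1}]:
// a digit above 2^{c-1} borrows 2^c from the next window and goes negative,
// halving the number of buckets needed.
func signedDigits(scalar uint64, c uint) []int64 {
	mask := uint64(1)<<c - 1
	half := int64(1) << (c - 1)
	var digits []int64
	carry := int64(0)
	for scalar != 0 || carry != 0 {
		d := int64(scalar&mask) + carry
		scalar >>= c
		carry = 0
		if d > half {
			d -= int64(1) << c // borrow 2^c from the next window...
			carry = 1          // ...and repay it there
		}
		digits = append(digits, d)
	}
	return digits
}

For instance, signedDigits(255, 4) yields [-1, 0, 1], i.e. 255 = -1 + 0*2^4 + 1*2^8; a negative digit is processed by adding -G to the bucket, and negating a point is cheap.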
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 4: - p.msmC4(points, scalars, splitFirstChunk) - - case 5: - p.msmC5(points, scalars, splitFirstChunk) - - case 6: - p.msmC6(points, scalars, splitFirstChunk) - - case 7: - p.msmC7(points, scalars, splitFirstChunk) - - case 8: - p.msmC8(points, scalars, splitFirstChunk) - - case 9: - p.msmC9(points, scalars, splitFirstChunk) - - case 10: - p.batchAffineMsmC10(points, scalars, splitFirstChunk) - - case 11: - p.batchAffineMsmC11(points, scalars, splitFirstChunk) - - case 12: - p.batchAffineMsmC12(points, scalars, splitFirstChunk) - - case 13: - p.batchAffineMsmC13(points, scalars, splitFirstChunk) - - case 14: - p.batchAffineMsmC14(points, scalars, splitFirstChunk) - - case 15: - p.batchAffineMsmC15(points, scalars, splitFirstChunk) - - case 16: - p.batchAffineMsmC16(points, scalars, splitFirstChunk) - - case 20: - p.batchAffineMsmC20(points, scalars, splitFirstChunk) - - case 21: - p.batchAffineMsmC21(points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { - var _p g2JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG2Affine struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G2Affine -} - -func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { - batchSize := len(buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG2Affine{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), - } -} - -func (b *BatchG2Affine) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG2Affine) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG2Affine) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG2Affine) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - B := &b.buckets[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if B.IsInfinity() { - if op.isNeg() { - B.Neg(P) - } else { 
- B.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} - -func msmProcessChunkG2AffineBatchAffine(chunk uint64, - chRes chan<- g2JacExtended, - buckets []G2Affine, - c uint64, - points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - batch := newBatchG2Affine(buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - op.bucketID = uint32(bits - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) - } else { - // sub - op.bucketID = (uint32(bits & ^msbWindow)) - op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) - } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() - nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - } - } else { - // put it in queue. - queue = append(queue, op) - } - } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() - for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. - } - - // flush items in batch. - batch.ExecuteAndReset() - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] - - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { - runningSum.addMixed(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func (p *G2Jac) batchAffineMsmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else 
{ - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that 
buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 17: + batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 18: + batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 19: + batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + case 20: + batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - 
}(uint64(nbChunks), points, scalars) - } + case 21: + batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 22: + batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 23: + batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + default: + panic("not implemented") } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) } -func (p *G2Jac) batchAffineMsmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance +type BatchG2Affine[B ibG2Affine] struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G2Affine + buckets *B +} - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) +func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { + batchSize := len(*buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + if batchSize <= 0 { + batchSize = 1 } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + return BatchG2Affine[B]{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } +} - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } +func (b *BatchG2Affine[B]) IsFull() bool { + return b.cptP == b.batchSize +} - if !splitFirstChunk { - 
go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func (b *BatchG2Affine[B]) ExecuteAndReset() { + if b.cptP == 0 { + return } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 } -func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance +func (b *BatchG2Affine[B]) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) + BK := &(*b.buckets)[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(P) + } else { + BK.Set(P) + } + return } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(P) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() + return + } } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = BK + if op.isNeg() { + b.P[b.cptP].Neg(P) } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + b.P[b.cptP].Set(P) } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + b.cptP++ } -func (p *G2Jac) batchAffineMsmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + 
queue[i] = queue[len(queue)-1]
+			queue = queue[:len(queue)-1]
+		}
+	}
+	return queue
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+}
-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g2JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g2JacExtended, 1)
-	}
+func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64,
+	chRes chan<- g2JacExtended,
+	c uint64,
+	points []G2Affine,
+	scalars []fr.Element) {
-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	// TODO @gbotrel replace this in code generator
-	if lastC >= 10 {
-		go func(j uint64, points []G2Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]G2Affine
-			msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
-	} else {
-		go func(j uint64, points []G2Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]g2JacExtended
-			msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))
+	var buckets B
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
 	}
-	processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) {
-		var buckets [1 << (c - 1)]G2Affine
-		msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
 	}
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
+	batch := newBatchG2Affine(&buckets, points)
+	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
+	nbBatches := 0
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		op := batchOp{pointID: uint32(i) << 1}
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			op.bucketID = uint32(bits - 1)
+			// buckets[bits-1].Add(&points[i], &buckets[bits-1])
+		} else {
+			// sub
+			op.bucketID = (uint32(bits & ^msbWindow))
+			op.pointID += 1
+			// op.isNeg = true
+			// buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i])
+		}
+		if batch.CanAdd(op.bucketID) {
+			batch.Add(op)
+			if batch.IsFull() {
+				batch.ExecuteAndReset()
+				nbBatches++
+				if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing
+					batch.Add(queue[len(queue)-1])
+					queue = queue[:len(queue)-1]
+				}
+			}
+		} else {
+			// put it in queue.
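The CanAdd/queue dance here exists because one ExecuteAndReset amortizes a single field inversion across the whole batch of affine additions, so no bucket may appear twice in the same batch: the second addition would read coordinates the first one is about to overwrite. A stripped-down sketch of just that scheduling invariant (illustrative names, not this patch's API):

// Sketch only: the distinct-buckets-per-batch rule behind CanAdd/queue.
package batchsched

// batchOp mirrors the idea of the patch's batchOp: a pending bucket update.
type batchOp struct{ bucketID uint32 }

// conflictBatcher parks an op whose bucket is already in the current batch,
// and retries it once the batch (and its shared inversion) has been flushed.
type conflictBatcher struct {
	current []batchOp
	inBatch map[uint32]struct{}
	queue   []batchOp
	size    int
	flush   func([]batchOp) // assumed to run one batched add + shared inversion
}

func newConflictBatcher(size int, flush func([]batchOp)) *conflictBatcher {
	return &conflictBatcher{inBatch: make(map[uint32]struct{}, size), size: size, flush: flush}
}

func (b *conflictBatcher) add(op batchOp) {
	if _, clash := b.inBatch[op.bucketID]; clash {
		b.queue = append(b.queue, op) // same bucket twice in a batch is unsafe
		return
	}
	b.inBatch[op.bucketID] = struct{}{}
	b.current = append(b.current, op)
	if len(b.current) == b.size {
		b.execute()
	}
}

func (b *conflictBatcher) execute() {
	if len(b.current) == 0 {
		return
	}
	b.flush(b.current)
	b.current = b.current[:0]
	for k := range b.inBatch {
		delete(b.inBatch, k) // all buckets are free again
	}
}

// drain retries queued ops, flushing partial batches so buckets free up.
func (b *conflictBatcher) drain() {
	for len(b.queue) != 0 {
		pending := b.queue
		b.queue = nil
		for _, op := range pending {
			b.add(op)
		}
		b.execute()
	}
}

drain matches the `for len(queue) != 0` loop that follows in the patch: batches are executed even when not full, precisely so that conflicting buckets become available again.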
+ queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG2Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) } - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + chRes <- total + } -func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -1834,29 +1017,25 @@ func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
+ msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -1879,5 +1058,104 @@ func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, split }() } - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) +} + +type bucketG2AffineC1 [1 << (1 - 1)]G2Affine +type bucketG2AffineC2 [1 << (2 - 1)]G2Affine +type bucketG2AffineC3 [1 << (3 - 1)]G2Affine +type bucketG2AffineC4 [1 << (4 - 1)]G2Affine +type bucketG2AffineC5 [1 << (5 - 1)]G2Affine +type bucketG2AffineC6 [1 << (6 - 1)]G2Affine +type bucketG2AffineC7 [1 << (7 - 1)]G2Affine +type bucketG2AffineC8 [1 << (8 - 1)]G2Affine +type bucketG2AffineC9 [1 << (9 - 1)]G2Affine +type bucketG2AffineC10 [1 << (10 - 1)]G2Affine +type bucketG2AffineC11 [1 << (11 - 1)]G2Affine +type bucketG2AffineC12 [1 << (12 - 1)]G2Affine +type bucketG2AffineC13 [1 << (13 - 1)]G2Affine +type bucketG2AffineC14 [1 << (14 - 1)]G2Affine +type bucketG2AffineC15 [1 << (15 - 1)]G2Affine +type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC17 [1 << (17 - 1)]G2Affine +type bucketG2AffineC18 [1 << (18 - 1)]G2Affine +type bucketG2AffineC19 [1 << (19 - 1)]G2Affine +type bucketG2AffineC20 [1 << (20 - 1)]G2Affine +type bucketG2AffineC21 [1 << (21 - 1)]G2Affine +type bucketG2AffineC22 [1 << (22 - 1)]G2Affine +type bucketG2AffineC23 [1 << (23 - 1)]G2Affine +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended +type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended +type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended +type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended +type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended +type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended +type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended +type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended +type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended +type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended +type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended +type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended +type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended +type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended +type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended +type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended +type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended +type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended +type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended + +type ibG2Affine interface { + bucketG2AffineC1 | + bucketG2AffineC2 | + bucketG2AffineC3 | + bucketG2AffineC4 | + bucketG2AffineC5 | + bucketG2AffineC6 | + bucketG2AffineC7 | + bucketG2AffineC8 | + bucketG2AffineC9 | + bucketG2AffineC10 | + bucketG2AffineC11 | + bucketG2AffineC12 | + bucketG2AffineC13 | + bucketG2AffineC14 | + bucketG2AffineC15 | + bucketG2AffineC16 | + bucketG2AffineC17 | + bucketG2AffineC18 | + bucketG2AffineC19 | + 
bucketG2AffineC20 | + bucketG2AffineC21 | + bucketG2AffineC22 | + bucketG2AffineC23 +} + +type ibg2JacExtended interface { + bucketg2JacExtendedC1 | + bucketg2JacExtendedC2 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC4 | + bucketg2JacExtendedC5 | + bucketg2JacExtendedC6 | + bucketg2JacExtendedC7 | + bucketg2JacExtendedC8 | + bucketg2JacExtendedC9 | + bucketg2JacExtendedC10 | + bucketg2JacExtendedC11 | + bucketg2JacExtendedC12 | + bucketg2JacExtendedC13 | + bucketg2JacExtendedC14 | + bucketg2JacExtendedC15 | + bucketg2JacExtendedC16 | + bucketg2JacExtendedC17 | + bucketg2JacExtendedC18 | + bucketg2JacExtendedC19 | + bucketg2JacExtendedC20 | + bucketg2JacExtendedC21 | + bucketg2JacExtendedC22 | + bucketg2JacExtendedC23 } diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index 6f5611f563..a942e21132 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index 5a1b6797d2..1950ae3ef6 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -221,7 +221,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -294,50 +294,74 @@ func msmInnerG1Jac(p *G1Jac, c int, 
points []G1Affine, scalars []fr.Element, spl switch c { + case 1: + msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) case 6: - p.msmC6(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) case 7: - p.msmC7(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) case 9: - p.msmC9(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) case 10: - p.msmC10(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) case 11: - p.msmC11(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) case 12: - p.msmC12(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) case 13: - p.msmC13(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) case 14: - p.msmC14(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) case 15: - p.msmC15(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) case 16: - p.msmC16(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) + + case 18: + msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + + case 19: + msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) case 20: - p.msmC20(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) case 21: - p.msmC21(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) + + case 22: + msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) + + case 23: + msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") @@ -360,9 +384,8 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J return p.unsafeFromJacExtended(&_p) } -func msmProcessChunkG1Affine(chunk uint64, +func 
msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64,
 	chRes chan<- g1JacExtended,
-	buckets []g1JacExtended,
 	c uint64,
 	points []G1Affine,
 	scalars []fr.Element) {
@@ -370,6 +393,7 @@ func msmProcessChunkG1Affine(chunk uint64,
 	mask := uint64((1 << c) - 1) // low c bits are 1
 	msbWindow := uint64(1 << (c - 1))
+	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
@@ -424,26 +448,36 @@ func msmProcessChunkG1Affine(chunk uint64,
 }
-func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
-	const (
-		c = 4 // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	)
-
+func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
+	nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
 	// for each chunk, spawn one go routine that'll loop through all the scalars in the
 	// corresponding bit-window
 	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
 	// critical for performance
 	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks]chan g1JacExtended
+	chChunks := make([]chan g1JacExtended, nbChunks)
 	for i := 0; i < len(chChunks); i++ {
 		chChunks[i] = make(chan g1JacExtended, 1)
 	}
+	if (fr.Limbs*64)%c != 0 {
+		// TODO @gbotrel not always needed to do ext jac here.
+		go func(j uint64, points []G1Affine, scalars []fr.Element) {
+			// var buckets LB
+			// lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
+			// buckets := make([]g1JacExtended, 1<<(lastC-1))
+			// TODO @gbotrel lastC restore.
+			msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars)
+		}(uint64(nbChunks-1), points, scalars)
+		nbChunks--
+	}
+
 	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		var buckets [1 << (c - 1)]g1JacExtended
-		msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars)
+		msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars)
 	}
 	for j := int(nbChunks - 1); j > 0; j-- {
@@ -466,1719 +500,325 @@ func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, splitFirstChunk b
 		}()
 	}
-	return msmReduceChunkG1Affine(p, c, chChunks[:])
+	return msmReduceChunkG1Affine(p, int(c), chChunks[:])
 }
-func (p *G1Jac) msmC5(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
-	const (
-		c = 5 // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	)
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
+func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExp(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
+func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the msmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	// --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point doesn't.
-
-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g1JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g1JacExtended, 1)
-	}
+	// for each msmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunk digits
+	// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	go func(j uint64, points []G1Affine, scalars []fr.Element) {
-		var buckets [1 << (lastC - 1)]g1JacExtended
-		msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars)
-	}(uint64(nbChunks), points, scalars)
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
-	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		var buckets [1 << (c - 1)]g1JacExtended
-		msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
 	}
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks,
+	bestC := func(nbPoints int) uint64 {
+		// implemented msmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+ // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand-picking c gives better results + min := math.MaxFloat64 + for _, c := range implementedCs { + cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cost := float64(cc) / float64(c) + if cost < min { + min = cost + C = c + } + } + // empirical, needs to be tuned. + // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } } - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) -func (p *G1Jac) msmC6(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 6 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG2Jac, but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // we have nbSplits intermediate results that we must sum together.
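+	// illustrative trace of the sizing loop above (assuming a 4-limb fr, i.e. 256-bit scalars):
+	// with nbPoints = 1<<20 and config.NbTasks = 16, bestC returns C = 17, so
+	// nbChunks = 15 full 17-bit windows + 1 final window = 16 >= NbTasks; nbSplits
+	// stays 1, the goroutine loop below is skipped, and the single msmInnerG2Jac
+	// call handles the whole input.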
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) + msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) } + close(chDone) + return p, nil +} - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) +func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + switch c { - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC7(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 7 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 4: + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 5: + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 
1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 8: + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 10: + msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} + case 11: + msmCG2Affine[bucketg2JacExtendedC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC8(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 12: + msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 13: + msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 14: + msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 15: + msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 16: + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 17: + msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, 
c, chChunks[:]) -} + case 18: + msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC9(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 9 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 19: + msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split 
:= len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, 
chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, 
chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], 
chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], 
scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExp(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the msmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each msmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. 
- // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. - _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 4: - p.msmC4(points, scalars, splitFirstChunk) - - case 5: - p.msmC5(points, scalars, splitFirstChunk) - - case 6: - p.msmC6(points, scalars, splitFirstChunk) - - case 7: - p.msmC7(points, scalars, splitFirstChunk) - - case 8: - p.msmC8(points, scalars, splitFirstChunk) - - case 9: - p.msmC9(points, scalars, splitFirstChunk) - - case 10: - p.msmC10(points, scalars, splitFirstChunk) - - case 11: - p.msmC11(points, scalars, splitFirstChunk) - - case 12: - p.msmC12(points, scalars, splitFirstChunk) - - case 13: - p.msmC13(points, scalars, splitFirstChunk) - - case 14: - p.msmC14(points, scalars, splitFirstChunk) - - case 15: - p.msmC15(points, scalars, splitFirstChunk) - - case 16: - p.msmC16(points, scalars, splitFirstChunk) - - case 20: - p.msmC20(points, scalars, splitFirstChunk) - - case 21: - p.msmC21(points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -// msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { - var _p g2JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -func msmProcessChunkG2Affine(chunk uint64, - chRes chan<- g2JacExtended, - buckets []g2JacExtended, - c 
uint64, - points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) - } - } - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func (p *G2Jac) msmC4(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 4 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC5(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 5 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 
256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC6(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 6 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC7(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 7 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - 
chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC8(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC9(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 9 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - 
msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, 
scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + case 20: + msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + case 21: + msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 22: + msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 23: + msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + default: + panic("not implemented") } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) } -func (p *G2Jac) msmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through 
all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +// msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { + var _p g2JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) } - return msmReduceChunkG2Affine(p, c, chChunks[:]) + return p.unsafeFromJacExtended(&_p) } -func (p *G2Jac) msmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) +func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + scalars []fr.Element) { - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + var buckets B 
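+	// B is one of the bucketg2JacExtendedC1..C23 array types selected in
+	// msmInnerG2Jac (presumably defined elsewhere in this file as
+	// [1 << (c-1)]g2JacExtended); since its length is a compile-time constant,
+	// the bucket array can stay on the stack instead of escaping to the heap
+	// as a slice parameter would.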
+ for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) } - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} -func (p *G2Jac) msmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + // for each scalar, get the digit corresponding to the chunk we're processing. + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + if bits == 0 { + continue + } - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) + // if msbWindow bit is set, we need to subtract + if bits&msbWindow == 0 { + // add + buckets[bits-1].addMixed(&points[i]) + } else { + // sub + buckets[bits & ^msbWindow].subMixed(&points[i]) + } } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ...
+ n*bucket[n-1] - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].ZZ.IsZero() { + runningSum.add(&buckets[k]) + } + total.add(&runningSum) } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + chRes <- total - return msmReduceChunkG2Affine(p, c, chChunks[:]) } -func (p *G2Jac) msmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), 
points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G2Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel last C restore. + msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -2201,5 +841,5 @@ func (p *G2Jac) msmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk }() } - return msmReduceChunkG2Affine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index 6623e42510..1965405349 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -93,7 +93,7 @@ func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, con // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ 
-166,102 +166,111 @@ func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.E switch c { + case 1: + msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) case 6: - p.msmC6(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) case 7: - p.msmC7(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) case 9: - p.msmC9(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) case 10: - p.batchAffineMsmC10(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) case 11: - p.batchAffineMsmC11(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) case 12: - p.batchAffineMsmC12(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) case 13: - p.batchAffineMsmC13(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) case 14: - p.batchAffineMsmC14(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) case 15: - p.batchAffineMsmC15(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) case 16: - p.batchAffineMsmC16(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) + + case 18: + batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + + case 19: + batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) case 20: - p.batchAffineMsmC20(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) case 21: - p.batchAffineMsmC21(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) + + case 22: + batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) + + case 23: + batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") } } -// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of 
the buckets into the result of the multiExp -func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG1Affine struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G1Affine +type BatchG1Affine[B ibG1Affine] struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G1Affine + buckets *B } -func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { - batchSize := len(buckets) / 5 +func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { + batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine{ + return BatchG1Affine[B]{ buckets: buckets, points: points, batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } } -func (b *BatchG1Affine) IsFull() bool { +func (b *BatchG1Affine[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *BatchG1Affine) ExecuteAndReset() { +func (b *BatchG1Affine[B]) ExecuteAndReset() { if b.cptP == 0 { return } @@ -276,45 +285,45 @@ func (b *BatchG1Affine) ExecuteAndReset() { b.cptP = 0 } -func (b *BatchG1Affine) CanAdd(bID uint32) bool { +func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *BatchG1Affine) Add(op batchOp) { +func (b *BatchG1Affine[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch - B := &b.buckets[op.bucketID] + BK := &(*b.buckets)[op.bucketID] P := &b.points[op.pointID>>1] if P.IsInfinity() { return } // handle special cases with inf or -P / P - if B.IsInfinity() { + if BK.IsInfinity() { if op.isNeg() { - B.Neg(P) + BK.Neg(P) } else { - B.Set(P) + BK.Set(P) } return } if op.isNeg() { // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() + if BK.Equal(P) { + BK.setInfinity() return } } else { // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() return } } // b.bucketIds[b.cptP] = op.bucketID b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B + b.R[b.cptP] = BK if op.isNeg() { b.P[b.cptP].Neg(P) } else { @@ -323,7 +332,7 @@ func (b *BatchG1Affine) Add(op batchOp) { b.cptP++ } -func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { +func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -338,16 +347,15 @@ func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { } -func msmProcessChunkG1AffineBatchAffine(chunk uint64, +func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, - buckets []G1Affine, c uint64, points []G1Affine, scalars []fr.Element) { mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) - + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -364,7 +372,7 @@ 
func msmProcessChunkG1AffineBatchAffine(chunk uint64, s.shiftHigh = (c - nbBitsHigh) } - batch := newBatchG1Affine(buckets, points) + batch := newBatchG1Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 for i := 0; i < len(scalars); i++ { @@ -433,11 +441,12 @@ func msmProcessChunkG1AffineBatchAffine(chunk uint64, } -func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -445,29 +454,25 @@ func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended + chChunks := make([]chan g1JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g1JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g1JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
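+			// the last window is narrower whenever c doesn't divide fr.Limbs*64; e.g. with
+			// 256-bit scalars (fr.Limbs == 4) and c == 10, we get 25 full 10-bit windows plus
+			// a final 6-bit one, so 1<<(6-1) buckets would suffice for this chunk; until the
+			// lastC sizing above is restored, it falls back to the full-size bucket array J.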
+ msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -490,1343 +495,521 @@ func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, split }() } - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } -func (p *G1Jac) batchAffineMsmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +type bucketG1AffineC1 [1 << (1 - 1)]G1Affine +type bucketG1AffineC2 [1 << (2 - 1)]G1Affine +type bucketG1AffineC3 [1 << (3 - 1)]G1Affine +type bucketG1AffineC4 [1 << (4 - 1)]G1Affine +type bucketG1AffineC5 [1 << (5 - 1)]G1Affine +type bucketG1AffineC6 [1 << (6 - 1)]G1Affine +type bucketG1AffineC7 [1 << (7 - 1)]G1Affine +type bucketG1AffineC8 [1 << (8 - 1)]G1Affine +type bucketG1AffineC9 [1 << (9 - 1)]G1Affine +type bucketG1AffineC10 [1 << (10 - 1)]G1Affine +type bucketG1AffineC11 [1 << (11 - 1)]G1Affine +type bucketG1AffineC12 [1 << (12 - 1)]G1Affine +type bucketG1AffineC13 [1 << (13 - 1)]G1Affine +type bucketG1AffineC14 [1 << (14 - 1)]G1Affine +type bucketG1AffineC15 [1 << (15 - 1)]G1Affine +type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC17 [1 << (17 - 1)]G1Affine +type bucketG1AffineC18 [1 << (18 - 1)]G1Affine +type bucketG1AffineC19 [1 << (19 - 1)]G1Affine +type bucketG1AffineC20 [1 << (20 - 1)]G1Affine +type bucketG1AffineC21 [1 << (21 - 1)]G1Affine +type bucketG1AffineC22 [1 << (22 - 1)]G1Affine +type bucketG1AffineC23 [1 << (23 - 1)]G1Affine +type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended +type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended +type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended +type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended +type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended +type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended +type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended +type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended +type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended +type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended +type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended +type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended +type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended +type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended +type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended +type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended +type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended +type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended +type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended +type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended + +type ibG1Affine interface { + bucketG1AffineC1 | + bucketG1AffineC2 | + bucketG1AffineC3 | + bucketG1AffineC4 | + bucketG1AffineC5 | + bucketG1AffineC6 | + bucketG1AffineC7 | + bucketG1AffineC8 | + 
bucketG1AffineC9 |
+		bucketG1AffineC10 |
+		bucketG1AffineC11 |
+		bucketG1AffineC12 |
+		bucketG1AffineC13 |
+		bucketG1AffineC14 |
+		bucketG1AffineC15 |
+		bucketG1AffineC16 |
+		bucketG1AffineC17 |
+		bucketG1AffineC18 |
+		bucketG1AffineC19 |
+		bucketG1AffineC20 |
+		bucketG1AffineC21 |
+		bucketG1AffineC22 |
+		bucketG1AffineC23
+}

-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+type ibg1JacExtended interface {
+	bucketg1JacExtendedC1 |
+		bucketg1JacExtendedC2 |
+		bucketg1JacExtendedC3 |
+		bucketg1JacExtendedC4 |
+		bucketg1JacExtendedC5 |
+		bucketg1JacExtendedC6 |
+		bucketg1JacExtendedC7 |
+		bucketg1JacExtendedC8 |
+		bucketg1JacExtendedC9 |
+		bucketg1JacExtendedC10 |
+		bucketg1JacExtendedC11 |
+		bucketg1JacExtendedC12 |
+		bucketg1JacExtendedC13 |
+		bucketg1JacExtendedC14 |
+		bucketg1JacExtendedC15 |
+		bucketg1JacExtendedC16 |
+		bucketg1JacExtendedC17 |
+		bucketg1JacExtendedC18 |
+		bucketg1JacExtendedC19 |
+		bucketg1JacExtendedC20 |
+		bucketg1JacExtendedC21 |
+		bucketg1JacExtendedC22 |
+		bucketg1JacExtendedC23
+}

-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g1JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g1JacExtended, 1)
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
+func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
 	}
+	p.FromJacobian(&_p)
+	return p, nil
+}

-	// c doesn't divide 256, last window is smaller we can allocate fewer buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	// TODO @gbotrel replace this in code generator
-	if lastC >= 10 {
-		go func(j uint64, points []G1Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]G1Affine
-			msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
-	} else {
-		go func(j uint64, points []G1Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]g1JacExtended
-			msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
-	}
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
+func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
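+	//     (a BN254 G1Affine is two 4-limb fp coordinates, 2*32 = 64 bytes, while a G2Affine
+	//     has E2 coordinates twice that size, 2*64 = 128 bytes, i.e. two cache lines)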
-	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		var buckets [1 << (c - 1)]G1Affine
-		msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunks digits
+	// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
 	}
 
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
 	}
 
-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g1JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks,
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
+		// if C > 16 && nbPoints < 1 << 23 {
+		// 	C = 16
+		// }
+		return C
 	}
 
-	return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:])
-}
+	var C uint64
+	nbSplits := 1
+	nbChunks := 0
+	for nbChunks < config.NbTasks {
+		C = bestC(nbPoints)
+		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+		if (fr.Limbs*64)%C != 0 {
+			nbChunks++
+		}
+		nbChunks *= nbSplits
+		if nbChunks < config.NbTasks {
+			nbSplits <<= 1
+			nbPoints >>= 1
+		}
+	}
 
-func (p *G1Jac) batchAffineMsmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
-	const (
-		c        = 12                  // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	)
+	// partition the scalars
+	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
+	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
+	var smallValues int
+	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
 
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+	// if we have more than 10% of small values, we split the processing of the first chunk in 2
+	// we may want to do that in msmInnerG2JacBatchAffine, but that would incur a cost of looping through all scalars one more time
+	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
 
-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g1JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g1JacExtended, 1)
+	// we have nbSplits intermediate results that we must sum together.
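+	// e.g. with config.NbTasks == 32 and C == 16 (16 chunks per 256-bit scalar), a single
+	// MSM only yields 16 tasks, so nbSplits doubles to 2: each half of the points is then
+	// processed below as an independent sub-MSM and the partial results are folded back
+	// into p with AddAssign.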
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) } + close(chDone) + return p, nil +} - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } +func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + switch c { - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) -func (p *G1Jac) batchAffineMsmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 4: + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 5: + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code 
generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 8: + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} + case 10: + batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) -func (p *G1Jac) batchAffineMsmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 11: + batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 12: + batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 13: + batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } + case 14: + 
batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 15: + batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 16: + batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go 
routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // 
number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. 
- - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 4: - p.msmC4(points, scalars, splitFirstChunk) - - case 5: - p.msmC5(points, scalars, splitFirstChunk) - - case 6: - p.msmC6(points, scalars, splitFirstChunk) - - case 7: - p.msmC7(points, scalars, splitFirstChunk) - - case 8: - p.msmC8(points, scalars, splitFirstChunk) - - case 9: - p.msmC9(points, scalars, splitFirstChunk) - - case 10: - p.batchAffineMsmC10(points, scalars, splitFirstChunk) - - case 11: - p.batchAffineMsmC11(points, scalars, splitFirstChunk) - - case 12: - p.batchAffineMsmC12(points, scalars, splitFirstChunk) - - case 13: - p.batchAffineMsmC13(points, scalars, splitFirstChunk) - - case 14: - p.batchAffineMsmC14(points, scalars, splitFirstChunk) - - case 15: - p.batchAffineMsmC15(points, scalars, splitFirstChunk) - - case 16: - p.batchAffineMsmC16(points, scalars, splitFirstChunk) - - case 20: - p.batchAffineMsmC20(points, scalars, splitFirstChunk) - - case 21: - p.batchAffineMsmC21(points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { - var _p g2JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG2Affine struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G2Affine -} - -func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { - batchSize := len(buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG2Affine{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), - } -} - -func (b *BatchG2Affine) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG2Affine) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG2Affine) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG2Affine) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - B := &b.buckets[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if B.IsInfinity() { - if op.isNeg() { - B.Neg(P) - } else { 
- B.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} - -func msmProcessChunkG2AffineBatchAffine(chunk uint64, - chRes chan<- g2JacExtended, - buckets []G2Affine, - c uint64, - points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - batch := newBatchG2Affine(buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - op.bucketID = uint32(bits - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) - } else { - // sub - op.bucketID = (uint32(bits & ^msbWindow)) - op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) - } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() - nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - } - } else { - // put it in queue. - queue = append(queue, op) - } - } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() - for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. - } - - // flush items in batch. - batch.ExecuteAndReset() - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] - - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { - runningSum.addMixed(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func (p *G2Jac) batchAffineMsmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else 
{ - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that 
buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 17: + batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 18: + batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 19: + batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + case 20: + batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - 
}(uint64(nbChunks), points, scalars) - } + case 21: + batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 22: + batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 23: + batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + default: + panic("not implemented") } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) } -func (p *G2Jac) batchAffineMsmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance +type BatchG2Affine[B ibG2Affine] struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G2Affine + buckets *B +} - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) +func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { + batchSize := len(*buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + if batchSize <= 0 { + batchSize = 1 } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + return BatchG2Affine[B]{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } +} - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } +func (b *BatchG2Affine[B]) IsFull() bool { + return b.cptP == b.batchSize +} - if !splitFirstChunk { - 
go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func (b *BatchG2Affine[B]) ExecuteAndReset() { + if b.cptP == 0 { + return } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 } -func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance +func (b *BatchG2Affine[B]) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) + BK := &(*b.buckets)[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(P) + } else { + BK.Set(P) + } + return } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(P) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() + return + } } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = BK + if op.isNeg() { + b.P[b.cptP].Neg(P) } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + b.P[b.cptP].Set(P) } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + b.cptP++ } -func (p *G2Jac) batchAffineMsmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + 
queue[i] = queue[len(queue)-1]
+			queue = queue[:len(queue)-1]
+		}
+	}
+	return queue
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+}
-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g2JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g2JacExtended, 1)
-	}
+func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64,
+	chRes chan<- g2JacExtended,
+	c uint64,
+	points []G2Affine,
+	scalars []fr.Element) {
-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	// TODO @gbotrel replace this in code generator
-	if lastC >= 10 {
-		go func(j uint64, points []G2Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]G2Affine
-			msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
-	} else {
-		go func(j uint64, points []G2Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]g2JacExtended
-			msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))
+	var buckets B
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
 	}
-	processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) {
-		var buckets [1 << (c - 1)]G2Affine
-		msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
 	}
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
+	batch := newBatchG2Affine(&buckets, points)
+	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
+	nbBatches := 0
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		op := batchOp{pointID: uint32(i) << 1}
+		// if the msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			op.bucketID = uint32(bits - 1)
+			// buckets[bits-1].Add(&points[i], &buckets[bits-1])
+		} else {
+			// sub
+			op.bucketID = (uint32(bits & ^msbWindow))
+			op.pointID += 1
+			// op.isNeg = true
+			// buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i])
+		}
+		if batch.CanAdd(op.bucketID) {
+			batch.Add(op)
+			if batch.IsFull() {
+				batch.ExecuteAndReset()
+				nbBatches++
+				if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing
+					batch.Add(queue[len(queue)-1])
+					queue = queue[:len(queue)-1]
+				}
+			}
+		} else {
+			// put it in the queue.
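+			// (the bucket is already touched by a pending op in the current
+			// batch; adding it twice before ExecuteAndReset would make the
+			// second addition read a stale bucket value, so we defer the op
+			// and retry it once the batch has been flushed)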
+ queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG2Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) } - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + chRes <- total + } -func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -1834,29 +1017,25 @@ func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
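+			// the spill-over window handled here (c does not divide the scalar
+			// bit size) still goes through extended-Jacobian buckets (type J):
+			// the batch-affine path is only wired for full c-bit windows.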
+ msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -1879,5 +1058,104 @@ func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, split }() } - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) +} + +type bucketG2AffineC1 [1 << (1 - 1)]G2Affine +type bucketG2AffineC2 [1 << (2 - 1)]G2Affine +type bucketG2AffineC3 [1 << (3 - 1)]G2Affine +type bucketG2AffineC4 [1 << (4 - 1)]G2Affine +type bucketG2AffineC5 [1 << (5 - 1)]G2Affine +type bucketG2AffineC6 [1 << (6 - 1)]G2Affine +type bucketG2AffineC7 [1 << (7 - 1)]G2Affine +type bucketG2AffineC8 [1 << (8 - 1)]G2Affine +type bucketG2AffineC9 [1 << (9 - 1)]G2Affine +type bucketG2AffineC10 [1 << (10 - 1)]G2Affine +type bucketG2AffineC11 [1 << (11 - 1)]G2Affine +type bucketG2AffineC12 [1 << (12 - 1)]G2Affine +type bucketG2AffineC13 [1 << (13 - 1)]G2Affine +type bucketG2AffineC14 [1 << (14 - 1)]G2Affine +type bucketG2AffineC15 [1 << (15 - 1)]G2Affine +type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC17 [1 << (17 - 1)]G2Affine +type bucketG2AffineC18 [1 << (18 - 1)]G2Affine +type bucketG2AffineC19 [1 << (19 - 1)]G2Affine +type bucketG2AffineC20 [1 << (20 - 1)]G2Affine +type bucketG2AffineC21 [1 << (21 - 1)]G2Affine +type bucketG2AffineC22 [1 << (22 - 1)]G2Affine +type bucketG2AffineC23 [1 << (23 - 1)]G2Affine +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended +type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended +type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended +type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended +type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended +type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended +type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended +type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended +type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended +type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended +type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended +type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended +type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended +type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended +type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended +type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended +type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended +type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended +type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended + +type ibG2Affine interface { + bucketG2AffineC1 | + bucketG2AffineC2 | + bucketG2AffineC3 | + bucketG2AffineC4 | + bucketG2AffineC5 | + bucketG2AffineC6 | + bucketG2AffineC7 | + bucketG2AffineC8 | + bucketG2AffineC9 | + bucketG2AffineC10 | + bucketG2AffineC11 | + bucketG2AffineC12 | + bucketG2AffineC13 | + bucketG2AffineC14 | + bucketG2AffineC15 | + bucketG2AffineC16 | + bucketG2AffineC17 | + bucketG2AffineC18 | + bucketG2AffineC19 | + 
bucketG2AffineC20 | + bucketG2AffineC21 | + bucketG2AffineC22 | + bucketG2AffineC23 +} + +type ibg2JacExtended interface { + bucketg2JacExtendedC1 | + bucketg2JacExtendedC2 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC4 | + bucketg2JacExtendedC5 | + bucketg2JacExtendedC6 | + bucketg2JacExtendedC7 | + bucketg2JacExtendedC8 | + bucketg2JacExtendedC9 | + bucketg2JacExtendedC10 | + bucketg2JacExtendedC11 | + bucketg2JacExtendedC12 | + bucketg2JacExtendedC13 | + bucketg2JacExtendedC14 | + bucketg2JacExtendedC15 | + bucketg2JacExtendedC16 | + bucketg2JacExtendedC17 | + bucketg2JacExtendedC18 | + bucketg2JacExtendedC19 | + bucketg2JacExtendedC20 | + bucketg2JacExtendedC21 | + bucketg2JacExtendedC22 | + bucketg2JacExtendedC23 } diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index b89f8a2375..293f3ec6ef 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index 4c120b4be4..c2ecab3d61 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -221,7 +221,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -294,50 +294,74 @@ func msmInnerG1Jac(p *G1Jac, c int, points 
[]G1Affine, scalars []fr.Element, spl switch c { + case 1: + msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) case 6: - p.msmC6(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) case 7: - p.msmC7(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) case 9: - p.msmC9(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) case 10: - p.msmC10(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) case 11: - p.msmC11(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) case 12: - p.msmC12(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) case 13: - p.msmC13(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) case 14: - p.msmC14(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) case 15: - p.msmC15(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) case 16: - p.msmC16(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) + + case 18: + msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + + case 19: + msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) case 20: - p.msmC20(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) case 21: - p.msmC21(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) + + case 22: + msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) + + case 23: + msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") @@ -360,9 +384,8 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J return p.unsafeFromJacExtended(&_p) } -func msmProcessChunkG1Affine(chunk uint64, +func msmProcessChunkG1Affine[B 
ibg1JacExtended](chunk uint64,
 	chRes chan<- g1JacExtended,
-	buckets []g1JacExtended,
 	c uint64,
 	points []G1Affine,
 	scalars []fr.Element) {
@@ -370,6 +393,7 @@ func msmProcessChunkG1Affine(chunk uint64,
 	mask := uint64((1 << c) - 1) // low c bits are 1
 	msbWindow := uint64(1 << (c - 1))
+	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
@@ -424,26 +448,36 @@ func msmProcessChunkG1Affine(chunk uint64,
 }
-func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
-	const (
-		c = 4 // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	)
-
+func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
+	nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
 	// for each chunk, spawn one go routine that'll loop through all the scalars in the
 	// corresponding bit-window
 	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
 	// critical for performance
 	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks]chan g1JacExtended
+	chChunks := make([]chan g1JacExtended, nbChunks)
 	for i := 0; i < len(chChunks); i++ {
 		chChunks[i] = make(chan g1JacExtended, 1)
 	}
+	if (fr.Limbs*64)%c != 0 {
+		// TODO @gbotrel not always needed to do ext jac here.
+		go func(j uint64, points []G1Affine, scalars []fr.Element) {
+			// var buckets LB
+			// lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
+			// buckets := make([]g1JacExtended, 1<<(lastC-1))
+			// TODO @gbotrel lastC restore.
+			msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars)
+		}(uint64(nbChunks-1), points, scalars)
+		nbChunks--
+	}
+
 	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		var buckets [1 << (c - 1)]g1JacExtended
-		msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars)
+		msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars)
 	}
 	for j := int(nbChunks - 1); j > 0; j-- {
@@ -466,1719 +500,325 @@ func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, splitFirstChunk b
 		}()
 	}
-	return msmReduceChunkG1Affine(p, c, chChunks[:])
+	return msmReduceChunkG1Affine(p, int(c), chChunks[:])
 }
-func (p *G1Jac) msmC5(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
-	const (
-		c = 5 // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	)
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExp(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
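+//
+// A minimal usage sketch (assuming points []G2Affine and scalars []fr.Element of equal length):
+//
+//	var p G2Jac
+//	if _, err := p.MultiExp(points, scalars, ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}); err != nil {
+//		// handle the error
+//	}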
+func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the msmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows us to declare the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g1JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g1JacExtended, 1)
-	}
+	// for each msmCX
+	// step 1
+	// we compute, for each scalar, over c-bit wide windows, nbChunks digits
+	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	go func(j uint64, points []G1Affine, scalars []fr.Element) {
-		var buckets [1 << (lastC - 1)]g1JacExtended
-		msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars)
-	}(uint64(nbChunks), points, scalars)
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
-	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		var buckets [1 << (c - 1)]g1JacExtended
-		msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
 	}
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks,
+	bestC := func(nbPoints int) uint64 {
+		// implemented msmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
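+		// e.g. with 256-bit scalars, nbPoints = 2^20 and c = 16:
+		// cost = (256/16) * (2^20 + 2^16) ≈ 17.8M group operations
+		// (a sanity check of the formula above, not a measured benchmark)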
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand-picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
+		// if C > 16 && nbPoints < 1 << 23 {
+		// 	C = 16
+		// }
+		return C
 	}
-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g1JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
+	var C uint64
+	nbSplits := 1
+	nbChunks := 0
+	for nbChunks < config.NbTasks {
+		C = bestC(nbPoints)
+		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+		if (fr.Limbs*64)%C != 0 {
+			nbChunks++
+		}
+		nbChunks *= nbSplits
+		if nbChunks < config.NbTasks {
+			nbSplits <<= 1
+			nbPoints >>= 1
+		}
 	}
-	return msmReduceChunkG1Affine(p, c, chChunks[:])
-}
+	// partition the scalars
+	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
+	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
+	var smallValues int
+	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+	// if we have more than 10% of small values, we split the processing of the first chunk in 2
+	// we may want to do that in msmInnerG2Jac, but that would incur a cost of looping through all scalars one more time
+	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
+	// we have nbSplits intermediate results that we must sum together.
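+	// the first nbSplits-1 slices are processed on their own goroutines and the
+	// last one on the calling goroutine; partial results are folded into p with
+	// AddAssign below.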
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) + msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) } + close(chDone) + return p, nil +} - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) +func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + switch c { - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC7(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 7 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 4: + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 5: + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 
1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 8: + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 10: + msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} + case 11: + msmCG2Affine[bucketg2JacExtendedC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC8(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 12: + msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 13: + msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 14: + msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 15: + msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 16: + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 17: + msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, 
c, chChunks[:]) -} + case 18: + msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC9(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 9 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 19: + msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split 
:= len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, 
chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, 
chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], 
chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], 
scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExp(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the msmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each msmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. 
- // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. - _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 4: - p.msmC4(points, scalars, splitFirstChunk) - - case 5: - p.msmC5(points, scalars, splitFirstChunk) - - case 6: - p.msmC6(points, scalars, splitFirstChunk) - - case 7: - p.msmC7(points, scalars, splitFirstChunk) - - case 8: - p.msmC8(points, scalars, splitFirstChunk) - - case 9: - p.msmC9(points, scalars, splitFirstChunk) - - case 10: - p.msmC10(points, scalars, splitFirstChunk) - - case 11: - p.msmC11(points, scalars, splitFirstChunk) - - case 12: - p.msmC12(points, scalars, splitFirstChunk) - - case 13: - p.msmC13(points, scalars, splitFirstChunk) - - case 14: - p.msmC14(points, scalars, splitFirstChunk) - - case 15: - p.msmC15(points, scalars, splitFirstChunk) - - case 16: - p.msmC16(points, scalars, splitFirstChunk) - - case 20: - p.msmC20(points, scalars, splitFirstChunk) - - case 21: - p.msmC21(points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -// msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { - var _p g2JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -func msmProcessChunkG2Affine(chunk uint64, - chRes chan<- g2JacExtended, - buckets []g2JacExtended, - c 
uint64, - points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) - } - } - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func (p *G2Jac) msmC4(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 4 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC5(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 5 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 
256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC6(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 6 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC7(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 7 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - 
chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC8(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC9(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 9 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - 
msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, 
scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + case 20: + msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + case 21: + msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 22: + msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 23: + msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + default: + panic("not implemented") } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) } -func (p *G2Jac) msmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through 
all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +// msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { + var _p g2JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) } - return msmReduceChunkG2Affine(p, c, chChunks[:]) + return p.unsafeFromJacExtended(&_p) } -func (p *G2Jac) msmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) +func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + scalars []fr.Element) { - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + var buckets B 
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
 	}
 
-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g2JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
 	}
 
-	return msmReduceChunkG2Affine(p, c, chChunks[:])
-}
-
-func (p *G2Jac) msmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac {
-	const (
-		c        = 15                   // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c)  // number of c-bit radixes in a scalar
-	)
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
 
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+		if bits == 0 {
+			continue
+		}
 
-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g2JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g2JacExtended, 1)
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].addMixed(&points[i])
+		} else {
+			// sub
+			buckets[bits & ^msbWindow].subMixed(&points[i])
+		}
 	}
 
-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	go func(j uint64, points []G2Affine, scalars []fr.Element) {
-		var buckets [1 << (lastC - 1)]g2JacExtended
-		msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars)
-	}(uint64(nbChunks), points, scalars)
-
-	processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) {
-		var buckets [1 << (c - 1)]g2JacExtended
-		msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars)
-	}
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ...
+ n*bucket[n-1] - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].ZZ.IsZero() { + runningSum.add(&buckets[k]) + } + total.add(&runningSum) } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + chRes <- total - return msmReduceChunkG2Affine(p, c, chChunks[:]) } -func (p *G2Jac) msmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), 
points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G2Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel last C restore. + msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -2201,5 +841,5 @@ func (p *G2Jac) msmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk }() } - return msmReduceChunkG2Affine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index d91b3cb89c..89db40edae 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -93,7 +93,7 @@ func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, con // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -166,102 +166,111 @@ 
func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.E switch c { + case 1: + msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) case 6: - p.msmC6(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) case 7: - p.msmC7(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) case 9: - p.msmC9(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) case 10: - p.batchAffineMsmC10(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) case 11: - p.batchAffineMsmC11(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) case 12: - p.batchAffineMsmC12(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) case 13: - p.batchAffineMsmC13(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) case 14: - p.batchAffineMsmC14(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) case 15: - p.batchAffineMsmC15(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) case 16: - p.batchAffineMsmC16(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) + + case 18: + batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + + case 19: + batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) case 20: - p.batchAffineMsmC20(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) case 21: - p.batchAffineMsmC21(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) + + case 22: + batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) + + case 23: + batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") } } -// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of the buckets into the 
result of the multiExp -func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG1Affine struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G1Affine +type BatchG1Affine[B ibG1Affine] struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G1Affine + buckets *B } -func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { - batchSize := len(buckets) / 5 +func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { + batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine{ + return BatchG1Affine[B]{ buckets: buckets, points: points, batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } } -func (b *BatchG1Affine) IsFull() bool { +func (b *BatchG1Affine[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *BatchG1Affine) ExecuteAndReset() { +func (b *BatchG1Affine[B]) ExecuteAndReset() { if b.cptP == 0 { return } @@ -276,45 +285,45 @@ func (b *BatchG1Affine) ExecuteAndReset() { b.cptP = 0 } -func (b *BatchG1Affine) CanAdd(bID uint32) bool { +func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *BatchG1Affine) Add(op batchOp) { +func (b *BatchG1Affine[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch - B := &b.buckets[op.bucketID] + BK := &(*b.buckets)[op.bucketID] P := &b.points[op.pointID>>1] if P.IsInfinity() { return } // handle special cases with inf or -P / P - if B.IsInfinity() { + if BK.IsInfinity() { if op.isNeg() { - B.Neg(P) + BK.Neg(P) } else { - B.Set(P) + BK.Set(P) } return } if op.isNeg() { // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() + if BK.Equal(P) { + BK.setInfinity() return } } else { // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() return } } // b.bucketIds[b.cptP] = op.bucketID b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B + b.R[b.cptP] = BK if op.isNeg() { b.P[b.cptP].Neg(P) } else { @@ -323,7 +332,7 @@ func (b *BatchG1Affine) Add(op batchOp) { b.cptP++ } -func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { +func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -338,16 +347,15 @@ func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { } -func msmProcessChunkG1AffineBatchAffine(chunk uint64, +func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, - buckets []G1Affine, c uint64, points []G1Affine, scalars []fr.Element) { mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) - + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -364,7 +372,7 @@ func 
msmProcessChunkG1AffineBatchAffine(chunk uint64, s.shiftHigh = (c - nbBitsHigh) } - batch := newBatchG1Affine(buckets, points) + batch := newBatchG1Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 for i := 0; i < len(scalars); i++ { @@ -433,11 +441,12 @@ func msmProcessChunkG1AffineBatchAffine(chunk uint64, } -func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -445,29 +454,25 @@ func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended + chChunks := make([]chan g1JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g1JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g1JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
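+			// for now the last (smaller) window falls back to the extended-Jacobian
+			// chunk processor; a dedicated smaller bucket type could be used here
+			// instead (see the TODO above).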
+ msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -490,1343 +495,521 @@ func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, split }() } - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } -func (p *G1Jac) batchAffineMsmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +type bucketG1AffineC1 [1 << (1 - 1)]G1Affine +type bucketG1AffineC2 [1 << (2 - 1)]G1Affine +type bucketG1AffineC3 [1 << (3 - 1)]G1Affine +type bucketG1AffineC4 [1 << (4 - 1)]G1Affine +type bucketG1AffineC5 [1 << (5 - 1)]G1Affine +type bucketG1AffineC6 [1 << (6 - 1)]G1Affine +type bucketG1AffineC7 [1 << (7 - 1)]G1Affine +type bucketG1AffineC8 [1 << (8 - 1)]G1Affine +type bucketG1AffineC9 [1 << (9 - 1)]G1Affine +type bucketG1AffineC10 [1 << (10 - 1)]G1Affine +type bucketG1AffineC11 [1 << (11 - 1)]G1Affine +type bucketG1AffineC12 [1 << (12 - 1)]G1Affine +type bucketG1AffineC13 [1 << (13 - 1)]G1Affine +type bucketG1AffineC14 [1 << (14 - 1)]G1Affine +type bucketG1AffineC15 [1 << (15 - 1)]G1Affine +type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC17 [1 << (17 - 1)]G1Affine +type bucketG1AffineC18 [1 << (18 - 1)]G1Affine +type bucketG1AffineC19 [1 << (19 - 1)]G1Affine +type bucketG1AffineC20 [1 << (20 - 1)]G1Affine +type bucketG1AffineC21 [1 << (21 - 1)]G1Affine +type bucketG1AffineC22 [1 << (22 - 1)]G1Affine +type bucketG1AffineC23 [1 << (23 - 1)]G1Affine +type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended +type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended +type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended +type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended +type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended +type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended +type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended +type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended +type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended +type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended +type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended +type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended +type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended +type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended +type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended +type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended +type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended +type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended +type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended +type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended + +type ibG1Affine interface { + bucketG1AffineC1 | + bucketG1AffineC2 | + bucketG1AffineC3 | + bucketG1AffineC4 | + bucketG1AffineC5 | + bucketG1AffineC6 | + bucketG1AffineC7 | + bucketG1AffineC8 | + 
bucketG1AffineC9 |
+	bucketG1AffineC10 |
+	bucketG1AffineC11 |
+	bucketG1AffineC12 |
+	bucketG1AffineC13 |
+	bucketG1AffineC14 |
+	bucketG1AffineC15 |
+	bucketG1AffineC16 |
+	bucketG1AffineC17 |
+	bucketG1AffineC18 |
+	bucketG1AffineC19 |
+	bucketG1AffineC20 |
+	bucketG1AffineC21 |
+	bucketG1AffineC22 |
+	bucketG1AffineC23
+}
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+type ibg1JacExtended interface {
+	bucketg1JacExtendedC1 |
+	bucketg1JacExtendedC2 |
+	bucketg1JacExtendedC3 |
+	bucketg1JacExtendedC4 |
+	bucketg1JacExtendedC5 |
+	bucketg1JacExtendedC6 |
+	bucketg1JacExtendedC7 |
+	bucketg1JacExtendedC8 |
+	bucketg1JacExtendedC9 |
+	bucketg1JacExtendedC10 |
+	bucketg1JacExtendedC11 |
+	bucketg1JacExtendedC12 |
+	bucketg1JacExtendedC13 |
+	bucketg1JacExtendedC14 |
+	bucketg1JacExtendedC15 |
+	bucketg1JacExtendedC16 |
+	bucketg1JacExtendedC17 |
+	bucketg1JacExtendedC18 |
+	bucketg1JacExtendedC19 |
+	bucketg1JacExtendedC20 |
+	bucketg1JacExtendedC21 |
+	bucketg1JacExtendedC22 |
+	bucketg1JacExtendedC23
+}
-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g1JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g1JacExtended, 1)
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	// TODO @gbotrel replace this in code generator
-	if lastC >= 10 {
-		go func(j uint64, points []G1Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]G1Affine
-			msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
-	} else {
-		go func(j uint64, points []G1Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]g1JacExtended
-			msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
-	}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2...)
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
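The bucketG1AffineCX array types and the ibG1Affine / ibg1JacExtended unions above make it possible for one generic function to declare its buckets as a fixed-size array, keeping them on the stack. A self-contained sketch of the same Go 1.18 type-set pattern, with toy names that are not part of the library:

package main

import "fmt"

type point struct{ x, y uint64 }

// one concrete array type per window size keeps lengths compile-time constants
type bucketsC2 [1 << (2 - 1)]point
type bucketsC3 [1 << (3 - 1)]point

// the constraint is a union of the concrete array types
type ibuckets interface {
	bucketsC2 | bucketsC3
}

// process declares buckets as a value of the type parameter: a fixed-size
// array the compiler can keep on the stack, unlike a heap-allocated slice.
func process[B ibuckets]() int {
	var buckets B
	return len(buckets)
}

func main() {
	fmt.Println(process[bucketsC2](), process[bucketsC3]()) // 2 4
}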
-	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		var buckets [1 << (c - 1)]G1Affine
-		msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar, nbChunks digits over c-bit wide windows
+	// if a digit is larger than 2^{c-1}, we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
	}
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
	}
-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g1JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand-picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
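The heuristic above minimizes cost = bits/c * (nbPoints + 2^c) over the implemented window sizes: bits/c chunks, each costing roughly one bucket addition per point plus on the order of 2^c operations to fold the buckets. A toy, self-contained version of the same search, as a re-derivation for illustration rather than the library's code:

package main

import (
	"fmt"
	"math"
)

// bestC minimizes bits/c * (nbPoints + 2^c), the approximate
// group-operation count of the bucket method.
func bestC(nbPoints, scalarBits int, implementedCs []uint64) uint64 {
	var best uint64
	min := math.MaxFloat64
	for _, c := range implementedCs {
		cost := float64(scalarBits) / float64(c) * float64(nbPoints+(1<<c))
		if cost < min {
			min, best = cost, c
		}
	}
	return best
}

func main() {
	cs := []uint64{4, 8, 12, 16, 20}
	// more points shift the optimum toward wider windows: prints 8 20
	fmt.Println(bestC(1<<10, 256, cs), bestC(1<<22, 256, cs))
}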
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C } - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } -func (p *G1Jac) batchAffineMsmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) + // we have nbSplits intermediate results that we must sum together. 
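The loop above doubles nbSplits (halving nbPoints) until nbSplits * nbChunks(C) covers config.NbTasks. The same sizing logic in isolation, a sketch with a fixed bestC so the numbers are easy to follow:

package main

import "fmt"

// splitPlan mirrors the sizing loop: double nbSplits and halve nbPoints
// until the total chunk count reaches nbTasks.
func splitPlan(nbPoints, nbTasks, scalarBits int, bestC func(int) uint64) (uint64, int) {
	var c uint64
	nbSplits, nbChunks := 1, 0
	for nbChunks < nbTasks {
		c = bestC(nbPoints)
		nbChunks = scalarBits / int(c)
		if scalarBits%int(c) != 0 {
			nbChunks++
		}
		nbChunks *= nbSplits
		if nbChunks < nbTasks {
			nbSplits <<= 1
			nbPoints >>= 1
		}
	}
	return c, nbSplits
}

func main() {
	fixedC := func(int) uint64 { return 16 } // pretend bestC always picks 16
	c, splits := splitPlan(1<<20, 64, 256, fixedC)
	fmt.Println(c, splits) // 16 4: four splits of 16 chunks each cover 64 tasks
}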
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) } + close(chDone) + return p, nil +} - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } +func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + switch c { - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) -func (p *G1Jac) batchAffineMsmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 4: + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 5: + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code 
generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 8: + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} + case 10: + batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) -func (p *G1Jac) batchAffineMsmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 11: + batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 12: + batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 13: + batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } + case 14: + 
batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 15: + batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 16: + batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go 
routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // 
number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. 
- - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 4: - p.msmC4(points, scalars, splitFirstChunk) - - case 5: - p.msmC5(points, scalars, splitFirstChunk) - - case 6: - p.msmC6(points, scalars, splitFirstChunk) - - case 7: - p.msmC7(points, scalars, splitFirstChunk) - - case 8: - p.msmC8(points, scalars, splitFirstChunk) - - case 9: - p.msmC9(points, scalars, splitFirstChunk) - - case 10: - p.batchAffineMsmC10(points, scalars, splitFirstChunk) - - case 11: - p.batchAffineMsmC11(points, scalars, splitFirstChunk) - - case 12: - p.batchAffineMsmC12(points, scalars, splitFirstChunk) - - case 13: - p.batchAffineMsmC13(points, scalars, splitFirstChunk) - - case 14: - p.batchAffineMsmC14(points, scalars, splitFirstChunk) - - case 15: - p.batchAffineMsmC15(points, scalars, splitFirstChunk) - - case 16: - p.batchAffineMsmC16(points, scalars, splitFirstChunk) - - case 20: - p.batchAffineMsmC20(points, scalars, splitFirstChunk) - - case 21: - p.batchAffineMsmC21(points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { - var _p g2JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG2Affine struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G2Affine -} - -func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { - batchSize := len(buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG2Affine{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), - } -} - -func (b *BatchG2Affine) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG2Affine) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG2Affine) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG2Affine) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - B := &b.buckets[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if B.IsInfinity() { - if op.isNeg() { - B.Neg(P) - } else { 
- B.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} - -func msmProcessChunkG2AffineBatchAffine(chunk uint64, - chRes chan<- g2JacExtended, - buckets []G2Affine, - c uint64, - points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - batch := newBatchG2Affine(buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - op.bucketID = uint32(bits - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) - } else { - // sub - op.bucketID = (uint32(bits & ^msbWindow)) - op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) - } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() - nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - } - } else { - // put it in queue. - queue = append(queue, op) - } - } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() - for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. - } - - // flush items in batch. - batch.ExecuteAndReset() - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] - - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { - runningSum.addMixed(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func (p *G2Jac) batchAffineMsmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else 
{ - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that 
buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 17: + batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 18: + batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 19: + batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + case 20: + batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - 
}(uint64(nbChunks), points, scalars) - } + case 21: + batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 22: + batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 23: + batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + default: + panic("not implemented") } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) } -func (p *G2Jac) batchAffineMsmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance +type BatchG2Affine[B ibG2Affine] struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G2Affine + buckets *B +} - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) +func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { + batchSize := len(*buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + if batchSize <= 0 { + batchSize = 1 } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + return BatchG2Affine[B]{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } +} - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } +func (b *BatchG2Affine[B]) IsFull() bool { + return b.cptP == b.batchSize +} - if !splitFirstChunk { - 
go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func (b *BatchG2Affine[B]) ExecuteAndReset() { + if b.cptP == 0 { + return } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 } -func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance +func (b *BatchG2Affine[B]) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) + BK := &(*b.buckets)[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(P) + } else { + BK.Set(P) + } + return } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(P) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() + return + } } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = BK + if op.isNeg() { + b.P[b.cptP].Neg(P) } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + b.P[b.cptP].Set(P) } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + b.cptP++ } -func (p *G2Jac) batchAffineMsmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + 
queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance +} - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } +func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + scalars []fr.Element) { - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() } - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) } - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + batch := newBatchG2Affine(&buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. 
+ queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG2Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) } - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + chRes <- total + } -func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -1834,29 +1017,25 @@ func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
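The reduction above relies on a running-sum identity: scanning from the top bucket, runningSum holds bucket[k] + ... + bucket[n-1] and is added into total once per index, yielding 1*bucket[0] + 2*bucket[1] + ... + n*bucket[n-1] with additions only. The same identity over plain integers, for illustration only:

package main

import "fmt"

// reduce returns 1*b[0] + 2*b[1] + ... + n*b[n-1] in a single
// top-down pass, mirroring the jacobian-extended bucket reduction.
func reduce(b []int) int {
	runningSum, total := 0, 0
	for k := len(b) - 1; k >= 0; k-- {
		runningSum += b[k] // b[k] + b[k+1] + ... + b[n-1]
		total += runningSum
	}
	return total
}

func main() {
	fmt.Println(reduce([]int{3, 1, 4})) // 1*3 + 2*1 + 3*4 = 17
}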
+ msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -1879,5 +1058,104 @@ func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, split }() } - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) +} + +type bucketG2AffineC1 [1 << (1 - 1)]G2Affine +type bucketG2AffineC2 [1 << (2 - 1)]G2Affine +type bucketG2AffineC3 [1 << (3 - 1)]G2Affine +type bucketG2AffineC4 [1 << (4 - 1)]G2Affine +type bucketG2AffineC5 [1 << (5 - 1)]G2Affine +type bucketG2AffineC6 [1 << (6 - 1)]G2Affine +type bucketG2AffineC7 [1 << (7 - 1)]G2Affine +type bucketG2AffineC8 [1 << (8 - 1)]G2Affine +type bucketG2AffineC9 [1 << (9 - 1)]G2Affine +type bucketG2AffineC10 [1 << (10 - 1)]G2Affine +type bucketG2AffineC11 [1 << (11 - 1)]G2Affine +type bucketG2AffineC12 [1 << (12 - 1)]G2Affine +type bucketG2AffineC13 [1 << (13 - 1)]G2Affine +type bucketG2AffineC14 [1 << (14 - 1)]G2Affine +type bucketG2AffineC15 [1 << (15 - 1)]G2Affine +type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC17 [1 << (17 - 1)]G2Affine +type bucketG2AffineC18 [1 << (18 - 1)]G2Affine +type bucketG2AffineC19 [1 << (19 - 1)]G2Affine +type bucketG2AffineC20 [1 << (20 - 1)]G2Affine +type bucketG2AffineC21 [1 << (21 - 1)]G2Affine +type bucketG2AffineC22 [1 << (22 - 1)]G2Affine +type bucketG2AffineC23 [1 << (23 - 1)]G2Affine +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended +type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended +type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended +type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended +type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended +type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended +type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended +type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended +type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended +type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended +type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended +type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended +type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended +type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended +type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended +type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended +type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended +type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended +type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended + +type ibG2Affine interface { + bucketG2AffineC1 | + bucketG2AffineC2 | + bucketG2AffineC3 | + bucketG2AffineC4 | + bucketG2AffineC5 | + bucketG2AffineC6 | + bucketG2AffineC7 | + bucketG2AffineC8 | + bucketG2AffineC9 | + bucketG2AffineC10 | + bucketG2AffineC11 | + bucketG2AffineC12 | + bucketG2AffineC13 | + bucketG2AffineC14 | + bucketG2AffineC15 | + bucketG2AffineC16 | + bucketG2AffineC17 | + bucketG2AffineC18 | + bucketG2AffineC19 | + 
bucketG2AffineC20 | + bucketG2AffineC21 | + bucketG2AffineC22 | + bucketG2AffineC23 +} + +type ibg2JacExtended interface { + bucketg2JacExtendedC1 | + bucketg2JacExtendedC2 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC4 | + bucketg2JacExtendedC5 | + bucketg2JacExtendedC6 | + bucketg2JacExtendedC7 | + bucketg2JacExtendedC8 | + bucketg2JacExtendedC9 | + bucketg2JacExtendedC10 | + bucketg2JacExtendedC11 | + bucketg2JacExtendedC12 | + bucketg2JacExtendedC13 | + bucketg2JacExtendedC14 | + bucketg2JacExtendedC15 | + bucketg2JacExtendedC16 | + bucketg2JacExtendedC17 | + bucketg2JacExtendedC18 | + bucketg2JacExtendedC19 | + bucketg2JacExtendedC20 | + bucketg2JacExtendedC21 | + bucketg2JacExtendedC22 | + bucketg2JacExtendedC23 } diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index 61341020f0..8a8ee0e90d 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index 2006674935..e49016a90e 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -221,7 +221,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -294,17 +294,74 @@ func msmInnerG1Jac(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, spl switch c { + case 1: 
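+		// each case pins the generic msm to two bucket-array sizes: the first
+		// covers the regular c-bit windows, the second the smaller trailing
+		// window of (fr.Limbs*64) mod c bits (e.g. the C3/C2 pairing below,
+		// since 320 mod 3 = 2 for this curve's 5-limb scalars).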
+ msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC2](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC5](p, 5, points, scalars, splitFirstChunk) + + case 6: + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC2](p, 6, points, scalars, splitFirstChunk) + + case 7: + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC5](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) + + case 9: + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC5](p, 9, points, scalars, splitFirstChunk) + + case 10: + msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC10](p, 10, points, scalars, splitFirstChunk) + + case 11: + msmCG1Affine[bucketg1JacExtendedC11, bucketg1JacExtendedC1](p, 11, points, scalars, splitFirstChunk) + + case 12: + msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC8](p, 12, points, scalars, splitFirstChunk) + + case 13: + msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC8](p, 13, points, scalars, splitFirstChunk) + + case 14: + msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC12](p, 14, points, scalars, splitFirstChunk) + + case 15: + msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC5](p, 15, points, scalars, splitFirstChunk) case 16: - p.msmC16(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC14](p, 17, points, scalars, splitFirstChunk) + + case 18: + msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC14](p, 18, points, scalars, splitFirstChunk) + + case 19: + msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC16](p, 19, points, scalars, splitFirstChunk) + + case 20: + msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC20](p, 20, points, scalars, splitFirstChunk) + + case 21: + msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC5](p, 21, points, scalars, splitFirstChunk) + + case 22: + msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC12](p, 22, points, scalars, splitFirstChunk) + + case 23: + msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC21](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") @@ -327,9 +384,8 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J return p.unsafeFromJacExtended(&_p) } -func msmProcessChunkG1Affine(chunk uint64, +func msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, - buckets []g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element) { @@ -337,6 +393,7 @@ func msmProcessChunkG1Affine(chunk uint64, mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -391,161 +448,36 @@ func msmProcessChunkG1Affine(chunk uint64, } -func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, 
splitFirstChunk bool) *G1Jac { - const ( - c = 4 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC5(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 5 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC8(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is // critical for performance // each 
go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended + chChunks := make([]chan g1JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g1JacExtended, 1) } - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G1Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g1JacExtended, 1<<(lastC-1)) + // TODO @gbotrel last C restore. 
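+			// this goroutine takes the trailing partial window; nbChunks is
+			// decremented just below so the main dispatch loop only schedules
+			// full c-bit windows.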
+ msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -568,7 +500,7 @@ func (p *G1Jac) msmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk }() } - return msmReduceChunkG1Affine(p, c, chChunks[:]) + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf @@ -630,7 +562,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -703,17 +635,74 @@ func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, spl switch c { + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC2](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC5](p, 5, points, scalars, splitFirstChunk) + + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC2](p, 6, points, scalars, splitFirstChunk) + + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC5](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) + + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC5](p, 9, points, scalars, splitFirstChunk) + + case 10: + msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC10](p, 10, points, scalars, splitFirstChunk) + + case 11: + msmCG2Affine[bucketg2JacExtendedC11, bucketg2JacExtendedC1](p, 11, points, scalars, splitFirstChunk) + + case 12: + msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC8](p, 12, points, scalars, splitFirstChunk) + + case 13: + msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC8](p, 13, points, scalars, splitFirstChunk) + + case 14: + msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC12](p, 14, points, scalars, splitFirstChunk) + + case 15: + msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC5](p, 15, points, scalars, splitFirstChunk) case 16: - p.msmC16(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC14](p, 17, points, scalars, splitFirstChunk) + + case 18: + msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC14](p, 18, 
points, scalars, splitFirstChunk) + + case 19: + msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC16](p, 19, points, scalars, splitFirstChunk) + + case 20: + msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC20](p, 20, points, scalars, splitFirstChunk) + + case 21: + msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC5](p, 21, points, scalars, splitFirstChunk) + + case 22: + msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC12](p, 22, points, scalars, splitFirstChunk) + + case 23: + msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC21](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") @@ -736,9 +725,8 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J return p.unsafeFromJacExtended(&_p) } -func msmProcessChunkG2Affine(chunk uint64, +func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, - buckets []g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element) { @@ -746,6 +734,7 @@ func msmProcessChunkG2Affine(chunk uint64, mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -800,161 +789,36 @@ func msmProcessChunkG2Affine(chunk uint64, } -func (p *G2Jac) msmC4(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 4 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC5(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 5 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - 
msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC8(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G2Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel last C restore. 
+ msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -977,5 +841,5 @@ func (p *G2Jac) msmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk }() } - return msmReduceChunkG2Affine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index 79740b7a69..5064b454a6 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -93,7 +93,7 @@ func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, con // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -166,69 +166,111 @@ func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.E switch c { + case 1: + msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC2](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC5](p, 5, points, scalars, splitFirstChunk) + + case 6: + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC2](p, 6, points, scalars, splitFirstChunk) + + case 7: + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC5](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) + + case 9: + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC5](p, 9, points, scalars, splitFirstChunk) + + case 10: + batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC10](p, 10, points, scalars, splitFirstChunk) + + case 11: + batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC1](p, 11, points, scalars, splitFirstChunk) + + case 12: + batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC8](p, 12, points, scalars, splitFirstChunk) + + case 13: + batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC8](p, 13, points, scalars, splitFirstChunk) + + case 14: + batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC12](p, 14, points, scalars, splitFirstChunk) + + case 15: + batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC5](p, 15, points, scalars, splitFirstChunk) case 16: - p.batchAffineMsmC16(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC14](p, 17, 
points, scalars, splitFirstChunk) + + case 18: + batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC14](p, 18, points, scalars, splitFirstChunk) + + case 19: + batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC16](p, 19, points, scalars, splitFirstChunk) + + case 20: + batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC20](p, 20, points, scalars, splitFirstChunk) + + case 21: + batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC5](p, 21, points, scalars, splitFirstChunk) + + case 22: + batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC12](p, 22, points, scalars, splitFirstChunk) + + case 23: + batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC21](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") } } -// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG1Affine struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G1Affine +type BatchG1Affine[B ibG1Affine] struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G1Affine + buckets *B } -func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { - batchSize := len(buckets) / 5 +func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { + batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine{ + return BatchG1Affine[B]{ buckets: buckets, points: points, batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } } -func (b *BatchG1Affine) IsFull() bool { +func (b *BatchG1Affine[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *BatchG1Affine) ExecuteAndReset() { +func (b *BatchG1Affine[B]) ExecuteAndReset() { if b.cptP == 0 { return } @@ -243,45 +285,45 @@ func (b *BatchG1Affine) ExecuteAndReset() { b.cptP = 0 } -func (b *BatchG1Affine) CanAdd(bID uint32) bool { +func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *BatchG1Affine) Add(op batchOp) { +func (b *BatchG1Affine[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch - B := &b.buckets[op.bucketID] + BK := &(*b.buckets)[op.bucketID] P := &b.points[op.pointID>>1] if P.IsInfinity() { return } // handle special cases with inf or -P / P - if B.IsInfinity() { + if BK.IsInfinity() { if op.isNeg() { - B.Neg(P) + BK.Neg(P) } else { - B.Set(P) + BK.Set(P) } return } if op.isNeg() { // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() + if BK.Equal(P) { + BK.setInfinity() return } } else { // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() return } } // b.bucketIds[b.cptP] = op.bucketID b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B + b.R[b.cptP] = BK if op.isNeg() { b.P[b.cptP].Neg(P) } else { @@ -290,7 +332,7 @@ 
func (b *BatchG1Affine) Add(op batchOp) { b.cptP++ } -func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { +func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -305,16 +347,15 @@ func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { } -func msmProcessChunkG1AffineBatchAffine(chunk uint64, +func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, - buckets []G1Affine, c uint64, points []G1Affine, scalars []fr.Element) { mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) - + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -331,7 +372,7 @@ func msmProcessChunkG1AffineBatchAffine(chunk uint64, s.shiftHigh = (c - nbBitsHigh) } - batch := newBatchG1Affine(buckets, points) + batch := newBatchG1Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 for i := 0; i < len(scalars); i++ { @@ -400,11 +441,12 @@ func msmProcessChunkG1AffineBatchAffine(chunk uint64, } -func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -412,14 +454,25 @@ func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended + chChunks := make([]chan g1JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g1JacExtended, 1) } + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G1Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g1JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
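+			// the trailing window takes the extended-jacobian route even in
+			// the batch-affine msm, mirroring the c < 10 cases of the switch
+			// above (batch inversion seems worthwhile only for larger windows,
+			// hence the cutoff).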
+ msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- + } + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -442,7 +495,106 @@ func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, split }() } - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) +} + +type bucketG1AffineC1 [1 << (1 - 1)]G1Affine +type bucketG1AffineC2 [1 << (2 - 1)]G1Affine +type bucketG1AffineC3 [1 << (3 - 1)]G1Affine +type bucketG1AffineC4 [1 << (4 - 1)]G1Affine +type bucketG1AffineC5 [1 << (5 - 1)]G1Affine +type bucketG1AffineC6 [1 << (6 - 1)]G1Affine +type bucketG1AffineC7 [1 << (7 - 1)]G1Affine +type bucketG1AffineC8 [1 << (8 - 1)]G1Affine +type bucketG1AffineC9 [1 << (9 - 1)]G1Affine +type bucketG1AffineC10 [1 << (10 - 1)]G1Affine +type bucketG1AffineC11 [1 << (11 - 1)]G1Affine +type bucketG1AffineC12 [1 << (12 - 1)]G1Affine +type bucketG1AffineC13 [1 << (13 - 1)]G1Affine +type bucketG1AffineC14 [1 << (14 - 1)]G1Affine +type bucketG1AffineC15 [1 << (15 - 1)]G1Affine +type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC17 [1 << (17 - 1)]G1Affine +type bucketG1AffineC18 [1 << (18 - 1)]G1Affine +type bucketG1AffineC19 [1 << (19 - 1)]G1Affine +type bucketG1AffineC20 [1 << (20 - 1)]G1Affine +type bucketG1AffineC21 [1 << (21 - 1)]G1Affine +type bucketG1AffineC22 [1 << (22 - 1)]G1Affine +type bucketG1AffineC23 [1 << (23 - 1)]G1Affine +type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended +type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended +type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended +type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended +type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended +type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended +type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended +type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended +type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended +type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended +type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended +type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended +type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended +type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended +type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended +type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended +type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended +type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended +type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended +type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended + +type ibG1Affine interface { + bucketG1AffineC1 | + bucketG1AffineC2 | + bucketG1AffineC3 | + bucketG1AffineC4 | + bucketG1AffineC5 | + bucketG1AffineC6 | + bucketG1AffineC7 | + bucketG1AffineC8 | + bucketG1AffineC9 | + bucketG1AffineC10 | + bucketG1AffineC11 | + bucketG1AffineC12 | + bucketG1AffineC13 | + bucketG1AffineC14 | + bucketG1AffineC15 | + bucketG1AffineC16 | + bucketG1AffineC17 | + bucketG1AffineC18 | + bucketG1AffineC19 | + 
bucketG1AffineC20 | + bucketG1AffineC21 | + bucketG1AffineC22 | + bucketG1AffineC23 +} + +type ibg1JacExtended interface { + bucketg1JacExtendedC1 | + bucketg1JacExtendedC2 | + bucketg1JacExtendedC3 | + bucketg1JacExtendedC4 | + bucketg1JacExtendedC5 | + bucketg1JacExtendedC6 | + bucketg1JacExtendedC7 | + bucketg1JacExtendedC8 | + bucketg1JacExtendedC9 | + bucketg1JacExtendedC10 | + bucketg1JacExtendedC11 | + bucketg1JacExtendedC12 | + bucketg1JacExtendedC13 | + bucketg1JacExtendedC14 | + bucketg1JacExtendedC15 | + bucketg1JacExtendedC16 | + bucketg1JacExtendedC17 | + bucketg1JacExtendedC18 | + bucketg1JacExtendedC19 | + bucketg1JacExtendedC20 | + bucketg1JacExtendedC21 | + bucketg1JacExtendedC22 | + bucketg1JacExtendedC23 } // MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf @@ -504,7 +656,7 @@ func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, con // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -577,69 +729,111 @@ func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.E switch c { + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC2](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC5](p, 5, points, scalars, splitFirstChunk) + + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC2](p, 6, points, scalars, splitFirstChunk) + + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC5](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) + + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC5](p, 9, points, scalars, splitFirstChunk) + + case 10: + batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC10](p, 10, points, scalars, splitFirstChunk) + + case 11: + batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC1](p, 11, points, scalars, splitFirstChunk) + + case 12: + batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC8](p, 12, points, scalars, splitFirstChunk) + + case 13: + batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC8](p, 13, points, scalars, splitFirstChunk) + + case 14: + batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC12](p, 14, points, scalars, splitFirstChunk) + + case 15: + batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC5](p, 15, points, scalars, splitFirstChunk) case 16: - p.batchAffineMsmC16(points, scalars, splitFirstChunk) + batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC14](p, 17, points, scalars, 
splitFirstChunk) + + case 18: + batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC14](p, 18, points, scalars, splitFirstChunk) + + case 19: + batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC16](p, 19, points, scalars, splitFirstChunk) + + case 20: + batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC20](p, 20, points, scalars, splitFirstChunk) + + case 21: + batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC5](p, 21, points, scalars, splitFirstChunk) + + case 22: + batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC12](p, 22, points, scalars, splitFirstChunk) + + case 23: + batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC21](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") } } -// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { - var _p g2JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG2Affine struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G2Affine +type BatchG2Affine[B ibG2Affine] struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G2Affine + buckets *B } -func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { - batchSize := len(buckets) / 5 +func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { + batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG2Affine{ + return BatchG2Affine[B]{ buckets: buckets, points: points, batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } } -func (b *BatchG2Affine) IsFull() bool { +func (b *BatchG2Affine[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *BatchG2Affine) ExecuteAndReset() { +func (b *BatchG2Affine[B]) ExecuteAndReset() { if b.cptP == 0 { return } @@ -654,45 +848,45 @@ func (b *BatchG2Affine) ExecuteAndReset() { b.cptP = 0 } -func (b *BatchG2Affine) CanAdd(bID uint32) bool { +func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *BatchG2Affine) Add(op batchOp) { +func (b *BatchG2Affine[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch - B := &b.buckets[op.bucketID] + BK := &(*b.buckets)[op.bucketID] P := &b.points[op.pointID>>1] if P.IsInfinity() { return } // handle special cases with inf or -P / P - if B.IsInfinity() { + if BK.IsInfinity() { if op.isNeg() { - B.Neg(P) + BK.Neg(P) } else { - B.Set(P) + BK.Set(P) } return } if op.isNeg() { // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() + if BK.Equal(P) { + BK.setInfinity() return } } else { // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() return } } // b.bucketIds[b.cptP] = op.bucketID b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B + b.R[b.cptP] = BK if op.isNeg() { b.P[b.cptP].Neg(P) } else { @@ -701,7 +895,7 @@ func (b 
*BatchG2Affine) Add(op batchOp) { b.cptP++ } -func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { +func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -716,16 +910,15 @@ func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { } -func msmProcessChunkG2AffineBatchAffine(chunk uint64, +func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, chRes chan<- g2JacExtended, - buckets []G2Affine, c uint64, points []G2Affine, scalars []fr.Element) { mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) - + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -742,7 +935,7 @@ func msmProcessChunkG2AffineBatchAffine(chunk uint64, s.shiftHigh = (c - nbBitsHigh) } - batch := newBatchG2Affine(buckets, points) + batch := newBatchG2Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 for i := 0; i < len(scalars); i++ { @@ -811,11 +1004,12 @@ func msmProcessChunkG2AffineBatchAffine(chunk uint64, } -func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -823,14 +1017,25 @@ func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G2Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
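+			// same scheme as G1: the trailing window's buckets stay in
+			// extended-jacobian form even on the batch-affine path.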
+ msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- + } + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -853,5 +1058,104 @@ func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, split }() } - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) +} + +type bucketG2AffineC1 [1 << (1 - 1)]G2Affine +type bucketG2AffineC2 [1 << (2 - 1)]G2Affine +type bucketG2AffineC3 [1 << (3 - 1)]G2Affine +type bucketG2AffineC4 [1 << (4 - 1)]G2Affine +type bucketG2AffineC5 [1 << (5 - 1)]G2Affine +type bucketG2AffineC6 [1 << (6 - 1)]G2Affine +type bucketG2AffineC7 [1 << (7 - 1)]G2Affine +type bucketG2AffineC8 [1 << (8 - 1)]G2Affine +type bucketG2AffineC9 [1 << (9 - 1)]G2Affine +type bucketG2AffineC10 [1 << (10 - 1)]G2Affine +type bucketG2AffineC11 [1 << (11 - 1)]G2Affine +type bucketG2AffineC12 [1 << (12 - 1)]G2Affine +type bucketG2AffineC13 [1 << (13 - 1)]G2Affine +type bucketG2AffineC14 [1 << (14 - 1)]G2Affine +type bucketG2AffineC15 [1 << (15 - 1)]G2Affine +type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC17 [1 << (17 - 1)]G2Affine +type bucketG2AffineC18 [1 << (18 - 1)]G2Affine +type bucketG2AffineC19 [1 << (19 - 1)]G2Affine +type bucketG2AffineC20 [1 << (20 - 1)]G2Affine +type bucketG2AffineC21 [1 << (21 - 1)]G2Affine +type bucketG2AffineC22 [1 << (22 - 1)]G2Affine +type bucketG2AffineC23 [1 << (23 - 1)]G2Affine +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended +type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended +type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended +type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended +type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended +type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended +type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended +type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended +type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended +type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended +type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended +type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended +type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended +type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended +type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended +type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended +type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended +type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended +type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended + +type ibG2Affine interface { + bucketG2AffineC1 | + bucketG2AffineC2 | + bucketG2AffineC3 | + bucketG2AffineC4 | + bucketG2AffineC5 | + bucketG2AffineC6 | + bucketG2AffineC7 | + bucketG2AffineC8 | + bucketG2AffineC9 | + bucketG2AffineC10 | + bucketG2AffineC11 | + bucketG2AffineC12 | + bucketG2AffineC13 | + bucketG2AffineC14 | + bucketG2AffineC15 | + bucketG2AffineC16 | + bucketG2AffineC17 | + bucketG2AffineC18 | + bucketG2AffineC19 | + 
bucketG2AffineC20 | + bucketG2AffineC21 | + bucketG2AffineC22 | + bucketG2AffineC23 +} + +type ibg2JacExtended interface { + bucketg2JacExtendedC1 | + bucketg2JacExtendedC2 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC4 | + bucketg2JacExtendedC5 | + bucketg2JacExtendedC6 | + bucketg2JacExtendedC7 | + bucketg2JacExtendedC8 | + bucketg2JacExtendedC9 | + bucketg2JacExtendedC10 | + bucketg2JacExtendedC11 | + bucketg2JacExtendedC12 | + bucketg2JacExtendedC13 | + bucketg2JacExtendedC14 | + bucketg2JacExtendedC15 | + bucketg2JacExtendedC16 | + bucketg2JacExtendedC17 | + bucketg2JacExtendedC18 | + bucketg2JacExtendedC19 | + bucketg2JacExtendedC20 | + bucketg2JacExtendedC21 | + bucketg2JacExtendedC22 | + bucketg2JacExtendedC23 } diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 282b60e573..32d3f4e986 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 8, 16} + cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index 698835f344..2ab93e7fd9 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -221,7 +221,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -294,17 +294,74 @@ func msmInnerG1Jac(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, spl switch c { + case 1: + 
msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC3](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC4](p, 5, points, scalars, splitFirstChunk) + + case 6: + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC6](p, 6, points, scalars, splitFirstChunk) + + case 7: + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC6](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) + + case 9: + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC6](p, 9, points, scalars, splitFirstChunk) + + case 10: + msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC4](p, 10, points, scalars, splitFirstChunk) + + case 11: + msmCG1Affine[bucketg1JacExtendedC11, bucketg1JacExtendedC10](p, 11, points, scalars, splitFirstChunk) + + case 12: + msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC12](p, 12, points, scalars, splitFirstChunk) + + case 13: + msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC7](p, 13, points, scalars, splitFirstChunk) + + case 14: + msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC6](p, 14, points, scalars, splitFirstChunk) + + case 15: + msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC9](p, 15, points, scalars, splitFirstChunk) case 16: - p.msmC16(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC10](p, 17, points, scalars, splitFirstChunk) + + case 18: + msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC6](p, 18, points, scalars, splitFirstChunk) + + case 19: + msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC4](p, 19, points, scalars, splitFirstChunk) + + case 20: + msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC4](p, 20, points, scalars, splitFirstChunk) + + case 21: + msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC6](p, 21, points, scalars, splitFirstChunk) + + case 22: + msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC10](p, 22, points, scalars, splitFirstChunk) + + case 23: + msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC16](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") @@ -327,9 +384,8 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J return p.unsafeFromJacExtended(&_p) } -func msmProcessChunkG1Affine(chunk uint64, +func msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, - buckets []g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element) { @@ -337,6 +393,7 @@ func msmProcessChunkG1Affine(chunk uint64, mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -391,168 +448,36 @@ func msmProcessChunkG1Affine(chunk uint64, } -func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, 
splitFirstChunk bool) *G1Jac { - const ( - c = 4 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC5(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 5 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 384, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC8(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const 
( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended + chChunks := make([]chan g1JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g1JacExtended, 1) } - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G1Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g1JacExtended, 1<<(lastC-1)) + // TODO @gbotrel last C restore. 
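+			// Editorial note: the LB type parameter plays the role of the old
+			// lastC bucket array. When c does not divide fr.Limbs*64, the top
+			// window only has lastC = (fr.Limbs*64) - c*((fr.Limbs*64)/c) bits,
+			// so 1<<(lastC-1) buckets suffice; e.g. with fr.Limbs = 6 (384 bits)
+			// and c = 5, lastC = 384 - 5*76 = 4, matching the
+			// bucketg1JacExtendedC4 argument in the c = 5 case above.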
+ msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -575,7 +500,7 @@ func (p *G1Jac) msmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk }() } - return msmReduceChunkG1Affine(p, c, chChunks[:]) + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf @@ -637,7 +562,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -710,17 +635,74 @@ func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, spl switch c { + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC3](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC4](p, 5, points, scalars, splitFirstChunk) + + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC6](p, 6, points, scalars, splitFirstChunk) + + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC6](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) + + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC6](p, 9, points, scalars, splitFirstChunk) + + case 10: + msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC4](p, 10, points, scalars, splitFirstChunk) + + case 11: + msmCG2Affine[bucketg2JacExtendedC11, bucketg2JacExtendedC10](p, 11, points, scalars, splitFirstChunk) + + case 12: + msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC12](p, 12, points, scalars, splitFirstChunk) + + case 13: + msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC7](p, 13, points, scalars, splitFirstChunk) + + case 14: + msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC6](p, 14, points, scalars, splitFirstChunk) + + case 15: + msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC9](p, 15, points, scalars, splitFirstChunk) case 16: - p.msmC16(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC10](p, 17, points, scalars, splitFirstChunk) + + case 18: + msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC6](p, 18, 
points, scalars, splitFirstChunk) + + case 19: + msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC4](p, 19, points, scalars, splitFirstChunk) + + case 20: + msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC4](p, 20, points, scalars, splitFirstChunk) + + case 21: + msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC6](p, 21, points, scalars, splitFirstChunk) + + case 22: + msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC10](p, 22, points, scalars, splitFirstChunk) + + case 23: + msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC16](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") @@ -743,9 +725,8 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J return p.unsafeFromJacExtended(&_p) } -func msmProcessChunkG2Affine(chunk uint64, +func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, - buckets []g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element) { @@ -753,6 +734,7 @@ func msmProcessChunkG2Affine(chunk uint64, mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -807,168 +789,36 @@ func msmProcessChunkG2Affine(chunk uint64, } -func (p *G2Jac) msmC4(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 4 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC5(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 5 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 384, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go 
func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC8(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. 
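+		// Editorial note: this goroutine handles the single partial window up
+		// front with its own, smaller bucket type LB, and nbChunks is then
+		// decremented so the loop below only spawns workers for full c-bit
+		// windows.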
+ go func(j uint64, points []G2Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel last C restore. + msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -991,5 +841,5 @@ func (p *G2Jac) msmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk }() } - return msmReduceChunkG2Affine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index 427f4d3891..3b533e9059 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -93,7 +93,7 @@ func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, con // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -166,69 +166,111 @@ func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.E switch c { + case 1: + msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC3](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC4](p, 5, points, scalars, splitFirstChunk) + + case 6: + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC6](p, 6, points, scalars, splitFirstChunk) + + case 7: + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC6](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) + + case 9: + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC6](p, 9, points, scalars, splitFirstChunk) + + case 10: + batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC4](p, 10, points, scalars, splitFirstChunk) + + case 11: + batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC10](p, 11, points, scalars, splitFirstChunk) + + case 12: + batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC12](p, 12, points, scalars, splitFirstChunk) + + case 13: + batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC7](p, 13, points, scalars, splitFirstChunk) + + case 14: + batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC6](p, 14, points, scalars, splitFirstChunk) + + case 15: + batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC9](p, 15, points, scalars, splitFirstChunk) case 16: - 
p.batchAffineMsmC16(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC10](p, 17, points, scalars, splitFirstChunk) + + case 18: + batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC6](p, 18, points, scalars, splitFirstChunk) + + case 19: + batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC4](p, 19, points, scalars, splitFirstChunk) + + case 20: + batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC4](p, 20, points, scalars, splitFirstChunk) + + case 21: + batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC6](p, 21, points, scalars, splitFirstChunk) + + case 22: + batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC10](p, 22, points, scalars, splitFirstChunk) + + case 23: + batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC16](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") } } -// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG1Affine struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G1Affine +type BatchG1Affine[B ibG1Affine] struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G1Affine + buckets *B } -func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { - batchSize := len(buckets) / 5 +func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { + batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine{ + return BatchG1Affine[B]{ buckets: buckets, points: points, batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } } -func (b *BatchG1Affine) IsFull() bool { +func (b *BatchG1Affine[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *BatchG1Affine) ExecuteAndReset() { +func (b *BatchG1Affine[B]) ExecuteAndReset() { if b.cptP == 0 { return } @@ -243,45 +285,45 @@ func (b *BatchG1Affine) ExecuteAndReset() { b.cptP = 0 } -func (b *BatchG1Affine) CanAdd(bID uint32) bool { +func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *BatchG1Affine) Add(op batchOp) { +func (b *BatchG1Affine[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch - B := &b.buckets[op.bucketID] + BK := &(*b.buckets)[op.bucketID] P := &b.points[op.pointID>>1] if P.IsInfinity() { return } // handle special cases with inf or -P / P - if B.IsInfinity() { + if BK.IsInfinity() { if op.isNeg() { - B.Neg(P) + BK.Neg(P) } else { - B.Set(P) + BK.Set(P) } return } if op.isNeg() { // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() + if BK.Equal(P) { + BK.setInfinity() return } } else { // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() + if 
BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() return } } // b.bucketIds[b.cptP] = op.bucketID b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B + b.R[b.cptP] = BK if op.isNeg() { b.P[b.cptP].Neg(P) } else { @@ -290,7 +332,7 @@ func (b *BatchG1Affine) Add(op batchOp) { b.cptP++ } -func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { +func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -305,16 +347,15 @@ func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { } -func msmProcessChunkG1AffineBatchAffine(chunk uint64, +func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, - buckets []G1Affine, c uint64, points []G1Affine, scalars []fr.Element) { mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) - + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -331,7 +372,7 @@ func msmProcessChunkG1AffineBatchAffine(chunk uint64, s.shiftHigh = (c - nbBitsHigh) } - batch := newBatchG1Affine(buckets, points) + batch := newBatchG1Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 for i := 0; i < len(scalars); i++ { @@ -400,11 +441,12 @@ func msmProcessChunkG1AffineBatchAffine(chunk uint64, } -func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -412,14 +454,25 @@ func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended + chChunks := make([]chan g1JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g1JacExtended, 1) } + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G1Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g1JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
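+			// Editorial note: the trailing window always takes the
+			// extended-Jacobian path (bucket type J) even inside the
+			// batch-affine MSM; per the TODO above this is conservative, since
+			// for some c the last window is still wide (e.g. c = 23 leaves a
+			// 16-bit window) and the batch-affine path could pay off there too.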
+ msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- + } + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -442,7 +495,106 @@ func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, split }() } - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) +} + +type bucketG1AffineC1 [1 << (1 - 1)]G1Affine +type bucketG1AffineC2 [1 << (2 - 1)]G1Affine +type bucketG1AffineC3 [1 << (3 - 1)]G1Affine +type bucketG1AffineC4 [1 << (4 - 1)]G1Affine +type bucketG1AffineC5 [1 << (5 - 1)]G1Affine +type bucketG1AffineC6 [1 << (6 - 1)]G1Affine +type bucketG1AffineC7 [1 << (7 - 1)]G1Affine +type bucketG1AffineC8 [1 << (8 - 1)]G1Affine +type bucketG1AffineC9 [1 << (9 - 1)]G1Affine +type bucketG1AffineC10 [1 << (10 - 1)]G1Affine +type bucketG1AffineC11 [1 << (11 - 1)]G1Affine +type bucketG1AffineC12 [1 << (12 - 1)]G1Affine +type bucketG1AffineC13 [1 << (13 - 1)]G1Affine +type bucketG1AffineC14 [1 << (14 - 1)]G1Affine +type bucketG1AffineC15 [1 << (15 - 1)]G1Affine +type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC17 [1 << (17 - 1)]G1Affine +type bucketG1AffineC18 [1 << (18 - 1)]G1Affine +type bucketG1AffineC19 [1 << (19 - 1)]G1Affine +type bucketG1AffineC20 [1 << (20 - 1)]G1Affine +type bucketG1AffineC21 [1 << (21 - 1)]G1Affine +type bucketG1AffineC22 [1 << (22 - 1)]G1Affine +type bucketG1AffineC23 [1 << (23 - 1)]G1Affine +type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended +type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended +type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended +type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended +type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended +type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended +type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended +type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended +type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended +type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended +type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended +type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended +type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended +type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended +type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended +type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended +type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended +type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended +type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended +type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended + +type ibG1Affine interface { + bucketG1AffineC1 | + bucketG1AffineC2 | + bucketG1AffineC3 | + bucketG1AffineC4 | + bucketG1AffineC5 | + bucketG1AffineC6 | + bucketG1AffineC7 | + bucketG1AffineC8 | + bucketG1AffineC9 | + bucketG1AffineC10 | + bucketG1AffineC11 | + bucketG1AffineC12 | + bucketG1AffineC13 | + bucketG1AffineC14 | + bucketG1AffineC15 | + bucketG1AffineC16 | + bucketG1AffineC17 | + bucketG1AffineC18 | + bucketG1AffineC19 | + 
bucketG1AffineC20 | + bucketG1AffineC21 | + bucketG1AffineC22 | + bucketG1AffineC23 +} + +type ibg1JacExtended interface { + bucketg1JacExtendedC1 | + bucketg1JacExtendedC2 | + bucketg1JacExtendedC3 | + bucketg1JacExtendedC4 | + bucketg1JacExtendedC5 | + bucketg1JacExtendedC6 | + bucketg1JacExtendedC7 | + bucketg1JacExtendedC8 | + bucketg1JacExtendedC9 | + bucketg1JacExtendedC10 | + bucketg1JacExtendedC11 | + bucketg1JacExtendedC12 | + bucketg1JacExtendedC13 | + bucketg1JacExtendedC14 | + bucketg1JacExtendedC15 | + bucketg1JacExtendedC16 | + bucketg1JacExtendedC17 | + bucketg1JacExtendedC18 | + bucketg1JacExtendedC19 | + bucketg1JacExtendedC20 | + bucketg1JacExtendedC21 | + bucketg1JacExtendedC22 | + bucketg1JacExtendedC23 } // MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf @@ -504,7 +656,7 @@ func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, con // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -577,69 +729,111 @@ func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.E switch c { + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC3](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC4](p, 5, points, scalars, splitFirstChunk) + + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC6](p, 6, points, scalars, splitFirstChunk) + + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC6](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) + + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC6](p, 9, points, scalars, splitFirstChunk) + + case 10: + batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC4](p, 10, points, scalars, splitFirstChunk) + + case 11: + batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC10](p, 11, points, scalars, splitFirstChunk) + + case 12: + batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC12](p, 12, points, scalars, splitFirstChunk) + + case 13: + batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC7](p, 13, points, scalars, splitFirstChunk) + + case 14: + batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC6](p, 14, points, scalars, splitFirstChunk) + + case 15: + batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC9](p, 15, points, scalars, splitFirstChunk) case 16: - p.batchAffineMsmC16(points, scalars, splitFirstChunk) + batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC10](p, 17, points, scalars, 
splitFirstChunk) + + case 18: + batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC6](p, 18, points, scalars, splitFirstChunk) + + case 19: + batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC4](p, 19, points, scalars, splitFirstChunk) + + case 20: + batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC4](p, 20, points, scalars, splitFirstChunk) + + case 21: + batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC6](p, 21, points, scalars, splitFirstChunk) + + case 22: + batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC10](p, 22, points, scalars, splitFirstChunk) + + case 23: + batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC16](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") } } -// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { - var _p g2JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG2Affine struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G2Affine +type BatchG2Affine[B ibG2Affine] struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G2Affine + buckets *B } -func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { - batchSize := len(buckets) / 5 +func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { + batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG2Affine{ + return BatchG2Affine[B]{ buckets: buckets, points: points, batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } } -func (b *BatchG2Affine) IsFull() bool { +func (b *BatchG2Affine[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *BatchG2Affine) ExecuteAndReset() { +func (b *BatchG2Affine[B]) ExecuteAndReset() { if b.cptP == 0 { return } @@ -654,45 +848,45 @@ func (b *BatchG2Affine) ExecuteAndReset() { b.cptP = 0 } -func (b *BatchG2Affine) CanAdd(bID uint32) bool { +func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *BatchG2Affine) Add(op batchOp) { +func (b *BatchG2Affine[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch - B := &b.buckets[op.bucketID] + BK := &(*b.buckets)[op.bucketID] P := &b.points[op.pointID>>1] if P.IsInfinity() { return } // handle special cases with inf or -P / P - if B.IsInfinity() { + if BK.IsInfinity() { if op.isNeg() { - B.Neg(P) + BK.Neg(P) } else { - B.Set(P) + BK.Set(P) } return } if op.isNeg() { // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() + if BK.Equal(P) { + BK.setInfinity() return } } else { // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() return } } // b.bucketIds[b.cptP] = op.bucketID b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B + b.R[b.cptP] = BK if op.isNeg() { b.P[b.cptP].Neg(P) } else { @@ -701,7 +895,7 @@ func (b 
*BatchG2Affine) Add(op batchOp) { b.cptP++ } -func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { +func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -716,16 +910,15 @@ func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { } -func msmProcessChunkG2AffineBatchAffine(chunk uint64, +func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, chRes chan<- g2JacExtended, - buckets []G2Affine, c uint64, points []G2Affine, scalars []fr.Element) { mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) - + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -742,7 +935,7 @@ func msmProcessChunkG2AffineBatchAffine(chunk uint64, s.shiftHigh = (c - nbBitsHigh) } - batch := newBatchG2Affine(buckets, points) + batch := newBatchG2Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 for i := 0; i < len(scalars); i++ { @@ -811,11 +1004,12 @@ func msmProcessChunkG2AffineBatchAffine(chunk uint64, } -func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -823,14 +1017,25 @@ func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G2Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
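+			// Editorial note: as on the G1 side, only the full c-bit windows
+			// below use the batch-affine buckets B (via
+			// msmProcessChunkG2AffineBatchAffine); the smaller trailing window
+			// reuses the ext-Jacobian chunk processor with bucket type J.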
+ msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- + } + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -853,5 +1058,104 @@ func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, split }() } - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) +} + +type bucketG2AffineC1 [1 << (1 - 1)]G2Affine +type bucketG2AffineC2 [1 << (2 - 1)]G2Affine +type bucketG2AffineC3 [1 << (3 - 1)]G2Affine +type bucketG2AffineC4 [1 << (4 - 1)]G2Affine +type bucketG2AffineC5 [1 << (5 - 1)]G2Affine +type bucketG2AffineC6 [1 << (6 - 1)]G2Affine +type bucketG2AffineC7 [1 << (7 - 1)]G2Affine +type bucketG2AffineC8 [1 << (8 - 1)]G2Affine +type bucketG2AffineC9 [1 << (9 - 1)]G2Affine +type bucketG2AffineC10 [1 << (10 - 1)]G2Affine +type bucketG2AffineC11 [1 << (11 - 1)]G2Affine +type bucketG2AffineC12 [1 << (12 - 1)]G2Affine +type bucketG2AffineC13 [1 << (13 - 1)]G2Affine +type bucketG2AffineC14 [1 << (14 - 1)]G2Affine +type bucketG2AffineC15 [1 << (15 - 1)]G2Affine +type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC17 [1 << (17 - 1)]G2Affine +type bucketG2AffineC18 [1 << (18 - 1)]G2Affine +type bucketG2AffineC19 [1 << (19 - 1)]G2Affine +type bucketG2AffineC20 [1 << (20 - 1)]G2Affine +type bucketG2AffineC21 [1 << (21 - 1)]G2Affine +type bucketG2AffineC22 [1 << (22 - 1)]G2Affine +type bucketG2AffineC23 [1 << (23 - 1)]G2Affine +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended +type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended +type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended +type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended +type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended +type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended +type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended +type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended +type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended +type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended +type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended +type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended +type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended +type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended +type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended +type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended +type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended +type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended +type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended + +type ibG2Affine interface { + bucketG2AffineC1 | + bucketG2AffineC2 | + bucketG2AffineC3 | + bucketG2AffineC4 | + bucketG2AffineC5 | + bucketG2AffineC6 | + bucketG2AffineC7 | + bucketG2AffineC8 | + bucketG2AffineC9 | + bucketG2AffineC10 | + bucketG2AffineC11 | + bucketG2AffineC12 | + bucketG2AffineC13 | + bucketG2AffineC14 | + bucketG2AffineC15 | + bucketG2AffineC16 | + bucketG2AffineC17 | + bucketG2AffineC18 | + bucketG2AffineC19 | + 
bucketG2AffineC20 | + bucketG2AffineC21 | + bucketG2AffineC22 | + bucketG2AffineC23 +} + +type ibg2JacExtended interface { + bucketg2JacExtendedC1 | + bucketg2JacExtendedC2 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC4 | + bucketg2JacExtendedC5 | + bucketg2JacExtendedC6 | + bucketg2JacExtendedC7 | + bucketg2JacExtendedC8 | + bucketg2JacExtendedC9 | + bucketg2JacExtendedC10 | + bucketg2JacExtendedC11 | + bucketg2JacExtendedC12 | + bucketg2JacExtendedC13 | + bucketg2JacExtendedC14 | + bucketg2JacExtendedC15 | + bucketg2JacExtendedC16 | + bucketg2JacExtendedC17 | + bucketg2JacExtendedC18 | + bucketg2JacExtendedC19 | + bucketg2JacExtendedC20 | + bucketg2JacExtendedC21 | + bucketg2JacExtendedC22 | + bucketg2JacExtendedC23 } diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index d101a5c9a6..6cbf26cdfa 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 8, 16} + cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index e9cef54ee0..cfaef03004 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -221,7 +221,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -294,17 +294,74 @@ func msmInnerG1Jac(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, spl switch c { + case 1: + 
msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC3](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC4](p, 5, points, scalars, splitFirstChunk) + + case 6: + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC6](p, 6, points, scalars, splitFirstChunk) + + case 7: + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC6](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) + + case 9: + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC6](p, 9, points, scalars, splitFirstChunk) + + case 10: + msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC4](p, 10, points, scalars, splitFirstChunk) + + case 11: + msmCG1Affine[bucketg1JacExtendedC11, bucketg1JacExtendedC10](p, 11, points, scalars, splitFirstChunk) + + case 12: + msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC12](p, 12, points, scalars, splitFirstChunk) + + case 13: + msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC7](p, 13, points, scalars, splitFirstChunk) + + case 14: + msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC6](p, 14, points, scalars, splitFirstChunk) + + case 15: + msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC9](p, 15, points, scalars, splitFirstChunk) case 16: - p.msmC16(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC10](p, 17, points, scalars, splitFirstChunk) + + case 18: + msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC6](p, 18, points, scalars, splitFirstChunk) + + case 19: + msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC4](p, 19, points, scalars, splitFirstChunk) + + case 20: + msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC4](p, 20, points, scalars, splitFirstChunk) + + case 21: + msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC6](p, 21, points, scalars, splitFirstChunk) + + case 22: + msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC10](p, 22, points, scalars, splitFirstChunk) + + case 23: + msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC16](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") @@ -327,9 +384,8 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J return p.unsafeFromJacExtended(&_p) } -func msmProcessChunkG1Affine(chunk uint64, +func msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, - buckets []g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element) { @@ -337,6 +393,7 @@ func msmProcessChunkG1Affine(chunk uint64, mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -391,168 +448,36 @@ func msmProcessChunkG1Affine(chunk uint64, } -func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, 
splitFirstChunk bool) *G1Jac { - const ( - c = 4 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC5(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 5 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 384, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC8(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const 
( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended + chChunks := make([]chan g1JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g1JacExtended, 1) } - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G1Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g1JacExtended, 1<<(lastC-1)) + // TODO @gbotrel last C restore. 
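+			// Editorial note: nbChunks above is effectively
+			// ceil(fr.Limbs*64 / c): the integer division is incremented once
+			// when c does not divide the scalar bit size, and that extra,
+			// smaller window is dispatched here before nbChunks is decremented
+			// for the main loop.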
+ msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -575,7 +500,7 @@ func (p *G1Jac) msmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk }() } - return msmReduceChunkG1Affine(p, c, chChunks[:]) + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf @@ -637,7 +562,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -710,17 +635,74 @@ func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, spl switch c { + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC3](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC4](p, 5, points, scalars, splitFirstChunk) + + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC6](p, 6, points, scalars, splitFirstChunk) + + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC6](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) + + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC6](p, 9, points, scalars, splitFirstChunk) + + case 10: + msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC4](p, 10, points, scalars, splitFirstChunk) + + case 11: + msmCG2Affine[bucketg2JacExtendedC11, bucketg2JacExtendedC10](p, 11, points, scalars, splitFirstChunk) + + case 12: + msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC12](p, 12, points, scalars, splitFirstChunk) + + case 13: + msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC7](p, 13, points, scalars, splitFirstChunk) + + case 14: + msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC6](p, 14, points, scalars, splitFirstChunk) + + case 15: + msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC9](p, 15, points, scalars, splitFirstChunk) case 16: - p.msmC16(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC10](p, 17, points, scalars, splitFirstChunk) + + case 18: + msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC6](p, 18, 
points, scalars, splitFirstChunk) + + case 19: + msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC4](p, 19, points, scalars, splitFirstChunk) + + case 20: + msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC4](p, 20, points, scalars, splitFirstChunk) + + case 21: + msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC6](p, 21, points, scalars, splitFirstChunk) + + case 22: + msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC10](p, 22, points, scalars, splitFirstChunk) + + case 23: + msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC16](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") @@ -743,9 +725,8 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J return p.unsafeFromJacExtended(&_p) } -func msmProcessChunkG2Affine(chunk uint64, +func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, - buckets []g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element) { @@ -753,6 +734,7 @@ func msmProcessChunkG2Affine(chunk uint64, mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -807,168 +789,36 @@ func msmProcessChunkG2Affine(chunk uint64, } -func (p *G2Jac) msmC4(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 4 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC5(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 5 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 384, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go 
func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC8(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. 
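// Illustrative sketch, not part of the patch: why the bucket storage is a type
// parameter rather than a slice. Constraining B to a union of fixed-size array
// types lets `var buckets B` declare an array whose length is fixed at compile
// time for each instantiation, so (for moderate c) it can stay on the stack
// instead of going through make([]T, 1<<(c-1)). Toy model of the pattern, all
// names hypothetical:
type pt struct{ x, y uint64 }

type bucketsC4 [1 << 3]pt
type bucketsC5 [1 << 4]pt

type ibuckets interface{ bucketsC4 | bucketsC5 }

func accumulate[B ibuckets](ids []int) B {
	var buckets B // fixed-size array: no heap allocation required
	for _, id := range ids {
		buckets[id].x++ // stand-in for "add point into bucket id"
	}
	return buckets
}

// e.g. accumulate[bucketsC5]([]int{3, 3, 7}) selects the 16-bucket variant.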
+ go func(j uint64, points []G2Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel last C restore. + msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -991,5 +841,5 @@ func (p *G2Jac) msmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk }() } - return msmReduceChunkG2Affine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index 09004ae309..72d199f31f 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -93,7 +93,7 @@ func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, con // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -166,69 +166,111 @@ func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.E switch c { + case 1: + msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC3](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC4](p, 5, points, scalars, splitFirstChunk) + + case 6: + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC6](p, 6, points, scalars, splitFirstChunk) + + case 7: + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC6](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) + + case 9: + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC6](p, 9, points, scalars, splitFirstChunk) + + case 10: + batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC4](p, 10, points, scalars, splitFirstChunk) + + case 11: + batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC10](p, 11, points, scalars, splitFirstChunk) + + case 12: + batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC12](p, 12, points, scalars, splitFirstChunk) + + case 13: + batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC7](p, 13, points, scalars, splitFirstChunk) + + case 14: + batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC6](p, 14, points, scalars, splitFirstChunk) + + case 15: + batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC9](p, 15, points, scalars, splitFirstChunk) case 16: - 
p.batchAffineMsmC16(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC10](p, 17, points, scalars, splitFirstChunk) + + case 18: + batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC6](p, 18, points, scalars, splitFirstChunk) + + case 19: + batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC4](p, 19, points, scalars, splitFirstChunk) + + case 20: + batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC4](p, 20, points, scalars, splitFirstChunk) + + case 21: + batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC6](p, 21, points, scalars, splitFirstChunk) + + case 22: + batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC10](p, 22, points, scalars, splitFirstChunk) + + case 23: + batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC16](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") } } -// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG1Affine struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G1Affine +type BatchG1Affine[B ibG1Affine] struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G1Affine + buckets *B } -func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { - batchSize := len(buckets) / 5 +func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { + batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine{ + return BatchG1Affine[B]{ buckets: buckets, points: points, batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } } -func (b *BatchG1Affine) IsFull() bool { +func (b *BatchG1Affine[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *BatchG1Affine) ExecuteAndReset() { +func (b *BatchG1Affine[B]) ExecuteAndReset() { if b.cptP == 0 { return } @@ -243,45 +285,45 @@ func (b *BatchG1Affine) ExecuteAndReset() { b.cptP = 0 } -func (b *BatchG1Affine) CanAdd(bID uint32) bool { +func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *BatchG1Affine) Add(op batchOp) { +func (b *BatchG1Affine[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch - B := &b.buckets[op.bucketID] + BK := &(*b.buckets)[op.bucketID] P := &b.points[op.pointID>>1] if P.IsInfinity() { return } // handle special cases with inf or -P / P - if B.IsInfinity() { + if BK.IsInfinity() { if op.isNeg() { - B.Neg(P) + BK.Neg(P) } else { - B.Set(P) + BK.Set(P) } return } if op.isNeg() { // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() + if BK.Equal(P) { + BK.setInfinity() return } } else { // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() + if 
BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() return } } // b.bucketIds[b.cptP] = op.bucketID b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B + b.R[b.cptP] = BK if op.isNeg() { b.P[b.cptP].Neg(P) } else { @@ -290,7 +332,7 @@ func (b *BatchG1Affine) Add(op batchOp) { b.cptP++ } -func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { +func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -305,16 +347,15 @@ func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { } -func msmProcessChunkG1AffineBatchAffine(chunk uint64, +func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, - buckets []G1Affine, c uint64, points []G1Affine, scalars []fr.Element) { mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) - + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -331,7 +372,7 @@ func msmProcessChunkG1AffineBatchAffine(chunk uint64, s.shiftHigh = (c - nbBitsHigh) } - batch := newBatchG1Affine(buckets, points) + batch := newBatchG1Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 for i := 0; i < len(scalars); i++ { @@ -400,11 +441,12 @@ func msmProcessChunkG1AffineBatchAffine(chunk uint64, } -func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -412,14 +454,25 @@ func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended + chChunks := make([]chan g1JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g1JacExtended, 1) } + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G1Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g1JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
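// Illustrative sketch, not part of the patch: the scheduling invariant that
// the bucketIds set above enforces. All additions in one batch share a single
// inversion computed from every input up front, so a bucket may appear at most
// once per batch; a conflicting op is parked in the queue and retried after
// the batch has been flushed. Hypothetical names, batchSize >= 1 assumed,
// flush standing in for ExecuteAndReset:
func schedule(ops []uint32, batchSize int, flush func([]uint32)) {
	inBatch := make(map[uint32]struct{})
	var batch []uint32
	flushBatch := func() {
		if len(batch) == 0 {
			return
		}
		flush(batch)
		batch = batch[:0]
		inBatch = make(map[uint32]struct{})
	}
	queue := append([]uint32(nil), ops...)
	for len(queue) > 0 {
		var retry []uint32
		for _, bucketID := range queue {
			if _, busy := inBatch[bucketID]; busy {
				retry = append(retry, bucketID) // conflict: defer to a later batch
				continue
			}
			inBatch[bucketID] = struct{}{}
			batch = append(batch, bucketID)
			if len(batch) == batchSize {
				flushBatch()
			}
		}
		flushBatch() // guarantee progress before retrying deferred ops
		queue = retry
	}
}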
+ msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- + } + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -442,7 +495,106 @@ func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, split }() } - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) +} + +type bucketG1AffineC1 [1 << (1 - 1)]G1Affine +type bucketG1AffineC2 [1 << (2 - 1)]G1Affine +type bucketG1AffineC3 [1 << (3 - 1)]G1Affine +type bucketG1AffineC4 [1 << (4 - 1)]G1Affine +type bucketG1AffineC5 [1 << (5 - 1)]G1Affine +type bucketG1AffineC6 [1 << (6 - 1)]G1Affine +type bucketG1AffineC7 [1 << (7 - 1)]G1Affine +type bucketG1AffineC8 [1 << (8 - 1)]G1Affine +type bucketG1AffineC9 [1 << (9 - 1)]G1Affine +type bucketG1AffineC10 [1 << (10 - 1)]G1Affine +type bucketG1AffineC11 [1 << (11 - 1)]G1Affine +type bucketG1AffineC12 [1 << (12 - 1)]G1Affine +type bucketG1AffineC13 [1 << (13 - 1)]G1Affine +type bucketG1AffineC14 [1 << (14 - 1)]G1Affine +type bucketG1AffineC15 [1 << (15 - 1)]G1Affine +type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC17 [1 << (17 - 1)]G1Affine +type bucketG1AffineC18 [1 << (18 - 1)]G1Affine +type bucketG1AffineC19 [1 << (19 - 1)]G1Affine +type bucketG1AffineC20 [1 << (20 - 1)]G1Affine +type bucketG1AffineC21 [1 << (21 - 1)]G1Affine +type bucketG1AffineC22 [1 << (22 - 1)]G1Affine +type bucketG1AffineC23 [1 << (23 - 1)]G1Affine +type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended +type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended +type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended +type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended +type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended +type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended +type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended +type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended +type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended +type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended +type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended +type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended +type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended +type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended +type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended +type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended +type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended +type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended +type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended +type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended + +type ibG1Affine interface { + bucketG1AffineC1 | + bucketG1AffineC2 | + bucketG1AffineC3 | + bucketG1AffineC4 | + bucketG1AffineC5 | + bucketG1AffineC6 | + bucketG1AffineC7 | + bucketG1AffineC8 | + bucketG1AffineC9 | + bucketG1AffineC10 | + bucketG1AffineC11 | + bucketG1AffineC12 | + bucketG1AffineC13 | + bucketG1AffineC14 | + bucketG1AffineC15 | + bucketG1AffineC16 | + bucketG1AffineC17 | + bucketG1AffineC18 | + bucketG1AffineC19 | + 
bucketG1AffineC20 | + bucketG1AffineC21 | + bucketG1AffineC22 | + bucketG1AffineC23 +} + +type ibg1JacExtended interface { + bucketg1JacExtendedC1 | + bucketg1JacExtendedC2 | + bucketg1JacExtendedC3 | + bucketg1JacExtendedC4 | + bucketg1JacExtendedC5 | + bucketg1JacExtendedC6 | + bucketg1JacExtendedC7 | + bucketg1JacExtendedC8 | + bucketg1JacExtendedC9 | + bucketg1JacExtendedC10 | + bucketg1JacExtendedC11 | + bucketg1JacExtendedC12 | + bucketg1JacExtendedC13 | + bucketg1JacExtendedC14 | + bucketg1JacExtendedC15 | + bucketg1JacExtendedC16 | + bucketg1JacExtendedC17 | + bucketg1JacExtendedC18 | + bucketg1JacExtendedC19 | + bucketg1JacExtendedC20 | + bucketg1JacExtendedC21 | + bucketg1JacExtendedC22 | + bucketg1JacExtendedC23 } // MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf @@ -504,7 +656,7 @@ func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, con // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -577,69 +729,111 @@ func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.E switch c { + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC3](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC4](p, 5, points, scalars, splitFirstChunk) + + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC6](p, 6, points, scalars, splitFirstChunk) + + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC6](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) + + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC6](p, 9, points, scalars, splitFirstChunk) + + case 10: + batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC4](p, 10, points, scalars, splitFirstChunk) + + case 11: + batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC10](p, 11, points, scalars, splitFirstChunk) + + case 12: + batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC12](p, 12, points, scalars, splitFirstChunk) + + case 13: + batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC7](p, 13, points, scalars, splitFirstChunk) + + case 14: + batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC6](p, 14, points, scalars, splitFirstChunk) + + case 15: + batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC9](p, 15, points, scalars, splitFirstChunk) case 16: - p.batchAffineMsmC16(points, scalars, splitFirstChunk) + batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC10](p, 17, points, scalars, 
splitFirstChunk) + + case 18: + batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC6](p, 18, points, scalars, splitFirstChunk) + + case 19: + batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC4](p, 19, points, scalars, splitFirstChunk) + + case 20: + batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC4](p, 20, points, scalars, splitFirstChunk) + + case 21: + batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC6](p, 21, points, scalars, splitFirstChunk) + + case 22: + batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC10](p, 22, points, scalars, splitFirstChunk) + + case 23: + batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC16](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") } } -// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { - var _p g2JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG2Affine struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G2Affine +type BatchG2Affine[B ibG2Affine] struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G2Affine + buckets *B } -func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { - batchSize := len(buckets) / 5 +func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { + batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG2Affine{ + return BatchG2Affine[B]{ buckets: buckets, points: points, batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } } -func (b *BatchG2Affine) IsFull() bool { +func (b *BatchG2Affine[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *BatchG2Affine) ExecuteAndReset() { +func (b *BatchG2Affine[B]) ExecuteAndReset() { if b.cptP == 0 { return } @@ -654,45 +848,45 @@ func (b *BatchG2Affine) ExecuteAndReset() { b.cptP = 0 } -func (b *BatchG2Affine) CanAdd(bID uint32) bool { +func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *BatchG2Affine) Add(op batchOp) { +func (b *BatchG2Affine[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch - B := &b.buckets[op.bucketID] + BK := &(*b.buckets)[op.bucketID] P := &b.points[op.pointID>>1] if P.IsInfinity() { return } // handle special cases with inf or -P / P - if B.IsInfinity() { + if BK.IsInfinity() { if op.isNeg() { - B.Neg(P) + BK.Neg(P) } else { - B.Set(P) + BK.Set(P) } return } if op.isNeg() { // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() + if BK.Equal(P) { + BK.setInfinity() return } } else { // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() return } } // b.bucketIds[b.cptP] = op.bucketID b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B + b.R[b.cptP] = BK if op.isNeg() { b.P[b.cptP].Neg(P) } else { @@ -701,7 +895,7 @@ func (b 
*BatchG2Affine) Add(op batchOp) { b.cptP++ } -func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { +func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -716,16 +910,15 @@ func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { } -func msmProcessChunkG2AffineBatchAffine(chunk uint64, +func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, chRes chan<- g2JacExtended, - buckets []G2Affine, c uint64, points []G2Affine, scalars []fr.Element) { mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) - + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -742,7 +935,7 @@ func msmProcessChunkG2AffineBatchAffine(chunk uint64, s.shiftHigh = (c - nbBitsHigh) } - batch := newBatchG2Affine(buckets, points) + batch := newBatchG2Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 for i := 0; i < len(scalars); i++ { @@ -811,11 +1004,12 @@ func msmProcessChunkG2AffineBatchAffine(chunk uint64, } -func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -823,14 +1017,25 @@ func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G2Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
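// Illustrative sketch, not part of the patch: the degenerate pairs that the
// Add methods above filter before queueing, modeled on toy integer
// coordinates. The batched affine formulas need finite, non-opposite inputs:
// the identity is handled by a copy or a no-op, and bucket == -P collapses the
// bucket back to the identity; everything else (including equal points) is
// left to the batched addition itself. Hypothetical names:
type affine struct {
	x, y int
	inf  bool
}

func addIntoBucket(bk *affine, p affine, neg bool) bool {
	if neg {
		p.y = -p.y // cheap negation, mirrors the isNeg() handling above
	}
	switch {
	case p.inf: // adding the identity: nothing to schedule
		return false
	case bk.inf: // empty bucket: a plain copy, no inversion needed
		*bk = p
		return false
	case bk.x == p.x && bk.y == -p.y: // bk == -p: result is the identity
		*bk = affine{inf: true}
		return false
	}
	return true // safe to enqueue for the batched addition
}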
+ msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- + } + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -853,5 +1058,104 @@ func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, split }() } - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) +} + +type bucketG2AffineC1 [1 << (1 - 1)]G2Affine +type bucketG2AffineC2 [1 << (2 - 1)]G2Affine +type bucketG2AffineC3 [1 << (3 - 1)]G2Affine +type bucketG2AffineC4 [1 << (4 - 1)]G2Affine +type bucketG2AffineC5 [1 << (5 - 1)]G2Affine +type bucketG2AffineC6 [1 << (6 - 1)]G2Affine +type bucketG2AffineC7 [1 << (7 - 1)]G2Affine +type bucketG2AffineC8 [1 << (8 - 1)]G2Affine +type bucketG2AffineC9 [1 << (9 - 1)]G2Affine +type bucketG2AffineC10 [1 << (10 - 1)]G2Affine +type bucketG2AffineC11 [1 << (11 - 1)]G2Affine +type bucketG2AffineC12 [1 << (12 - 1)]G2Affine +type bucketG2AffineC13 [1 << (13 - 1)]G2Affine +type bucketG2AffineC14 [1 << (14 - 1)]G2Affine +type bucketG2AffineC15 [1 << (15 - 1)]G2Affine +type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC17 [1 << (17 - 1)]G2Affine +type bucketG2AffineC18 [1 << (18 - 1)]G2Affine +type bucketG2AffineC19 [1 << (19 - 1)]G2Affine +type bucketG2AffineC20 [1 << (20 - 1)]G2Affine +type bucketG2AffineC21 [1 << (21 - 1)]G2Affine +type bucketG2AffineC22 [1 << (22 - 1)]G2Affine +type bucketG2AffineC23 [1 << (23 - 1)]G2Affine +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended +type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended +type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended +type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended +type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended +type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended +type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended +type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended +type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended +type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended +type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended +type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended +type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended +type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended +type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended +type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended +type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended +type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended +type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended + +type ibG2Affine interface { + bucketG2AffineC1 | + bucketG2AffineC2 | + bucketG2AffineC3 | + bucketG2AffineC4 | + bucketG2AffineC5 | + bucketG2AffineC6 | + bucketG2AffineC7 | + bucketG2AffineC8 | + bucketG2AffineC9 | + bucketG2AffineC10 | + bucketG2AffineC11 | + bucketG2AffineC12 | + bucketG2AffineC13 | + bucketG2AffineC14 | + bucketG2AffineC15 | + bucketG2AffineC16 | + bucketG2AffineC17 | + bucketG2AffineC18 | + bucketG2AffineC19 | + 
bucketG2AffineC20 | + bucketG2AffineC21 | + bucketG2AffineC22 | + bucketG2AffineC23 +} + +type ibg2JacExtended interface { + bucketg2JacExtendedC1 | + bucketg2JacExtendedC2 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC4 | + bucketg2JacExtendedC5 | + bucketg2JacExtendedC6 | + bucketg2JacExtendedC7 | + bucketg2JacExtendedC8 | + bucketg2JacExtendedC9 | + bucketg2JacExtendedC10 | + bucketg2JacExtendedC11 | + bucketg2JacExtendedC12 | + bucketg2JacExtendedC13 | + bucketg2JacExtendedC14 | + bucketg2JacExtendedC15 | + bucketg2JacExtendedC16 | + bucketg2JacExtendedC17 | + bucketg2JacExtendedC18 | + bucketg2JacExtendedC19 | + bucketg2JacExtendedC20 | + bucketg2JacExtendedC21 | + bucketg2JacExtendedC22 | + bucketg2JacExtendedC23 } diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index d5b1288c1e..8d851d2d42 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 8, 16} + cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/internal/generator/ecc/generate.go b/internal/generator/ecc/generate.go index bc367c5c14..6af6b7d54a 100644 --- a/internal/generator/ecc/generate.go +++ b/internal/generator/ecc/generate.go @@ -3,6 +3,7 @@ package ecc import ( "fmt" "path/filepath" + "reflect" "strings" "text/template" @@ -21,7 +22,36 @@ func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) er {File: filepath.Join(baseDir, "marshal_test.go"), Templates: []string{"tests/marshal.go.tmpl"}}, } conf.Package = packageName - if err := bgen.Generate(conf, packageName, "./ecc/template", entries...); err != nil { + funcs := make(template.FuncMap) + funcs["last"] = func(x int, a interface{}) bool { + return x == reflect.ValueOf(a).Len()-1 + } + funcs["lastC"] = func(c int) int { + // lastC := (fr.Limbs * 
64) - (c * (fr.Limbs * 64 / c)) + // if c divides fr.Limbs * 64; + n := (conf.Fr.NbWords * 64) + if n%c == 0 { + return c + } + return n - (c * (n / c)) + } + funcs["contains"] = func(v int, s []int) bool { + for _, sv := range s { + if v == sv { + return true + } + } + return false + } + // TODO @gbotrel fix me. need to generate usual C, and missing lastC for bucket size. + conf.G1.CRange = make([]int, 23) + conf.G2.CRange = make([]int, 23) + for i := 0; i < len(conf.G1.CRange); i++ { + conf.G1.CRange[i] = i + 1 + conf.G2.CRange[i] = i + 1 + } + bavardOpts := []func(*bavard.Bavard) error{bavard.Funcs(funcs)} + if err := bgen.GenerateWithOptions(conf, packageName, "./ecc/template", bavardOpts, entries...); err != nil { return err } diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index e476f889b7..62b5f03f62 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -299,7 +299,7 @@ func msmInner{{ $.TJacobian }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffin switch c { {{range $c := $.CRange}} case {{$c}}: - p.msmC{{$c}}(points, scalars, splitFirstChunk) + msmC{{ $.TAffine }}[bucket{{ $.TJacobianExtended }}C{{$c}}, bucket{{ $.TJacobianExtended }}C{{lastC $c}}](p, {{$c}}, points, scalars, splitFirstChunk) {{end}} default: panic("not implemented") @@ -323,9 +323,8 @@ func msmReduceChunk{{ $.TAffine }}(p *{{ $.TJacobian }}, c int, chChunks []chan } -func msmProcessChunk{{ $.TAffine }}(chunk uint64, +func msmProcessChunk{{ $.TAffine }}[B ib{{ $.TJacobianExtended }}](chunk uint64, chRes chan<- {{ $.TJacobianExtended }}, - buckets []{{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, scalars []fr.Element) { @@ -334,6 +333,7 @@ func msmProcessChunk{{ $.TAffine }}(chunk uint64, mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c -1)) + var buckets B for i := 0 ; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -391,44 +391,41 @@ func msmProcessChunk{{ $.TAffine }}(chunk uint64, } -{{range $c := $.CRange}} - -{{- $frBits := mul $.FrNbWords 64}} -{{- $cDividesBits := divides $c $frBits}} -{{- $nbChunks := div $frBits $c}} - -func (p *{{ $.TJacobian }}) msmC{{$c}}(points []{{ $.TAffine }}, scalars []fr.Element, splitFirstChunk bool) *{{ $.TJacobian }} { - const ( - c = {{$c}} // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func msmC{{ $.TAffine }}[B ib{{ $.TJacobianExtended }}, LB ib{{ $.TJacobianExtended }}](p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, scalars []fr.Element, splitFirstChunk bool) *{{ $.TJacobian }} { + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks{{if not $cDividesBits }} + 1 {{end}} ]chan {{ $.TJacobianExtended }} + chChunks := make([]chan {{ $.TJacobianExtended }}, nbChunks) for i:=0; i < len(chChunks);i++ { chChunks[i] = make(chan {{ $.TJacobianExtended }}, 1) } - {{ if not $cDividesBits }} - // c doesn't divide {{$frBits}}, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + + if 
(fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. go func(j uint64, points []{{ $.TAffine }}, scalars []fr.Element) { - var buckets [1<<(lastC-1)]{{ $.TJacobianExtended }} - msmProcessChunk{{ $.TAffine }}(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]{{ $.TJacobianExtended }}, 1<<(lastC-1)) + // TODO @gbotrel last C restore. + msmProcessChunk{{ $.TAffine }}[LB](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks - 1), points, scalars) + nbChunks-- + } - {{- end}} processChunk := func(j int, points []{{ $.TAffine }}, scalars []fr.Element, chChunk chan {{ $.TJacobianExtended }}) { - var buckets [1<<(c-1)]{{ $.TJacobianExtended }} - msmProcessChunk{{ $.TAffine }}(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunk{{ $.TAffine }}[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j >0; j-- { @@ -452,9 +449,7 @@ func (p *{{ $.TJacobian }}) msmC{{$c}}(points []{{ $.TAffine }}, scalars []fr.El } - return msmReduceChunk{{ $.TAffine }}(p, c, chChunks[:]) + return msmReduceChunk{{ $.TAffine }}(p, int(c), chChunks[:]) } -{{end}} - {{end }} diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index 02d8c72588..3a803280f3 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -27,7 +27,8 @@ func (o batchOp) isNeg() bool { {{ template "multiexp" dict "PointName" .G1.PointName "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange}} -{{ template "multiexp" dict "PointName" .G2.PointName "TAffine" $G2TAffine "TJacobian" $G2TJacobian "TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G2.CRange}} +{{ template "multiexp" dict "PointName" .G2.PointName "TAffine" $G2TAffine "TJacobian" $G2TJacobian "TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange}} + {{define "multiexp" }} @@ -169,9 +170,9 @@ func msmInner{{ $.TJacobian }}BatchAffine(p *{{ $.TJacobian }}, c int, points [] {{range $c := $.CRange}} case {{$c}}: {{- if le $c 9}} - p.msmC{{$c}}(points, scalars, splitFirstChunk) + msmC{{ $.TAffine }}[bucket{{ $.TJacobianExtended }}C{{$c}}, bucket{{ $.TJacobianExtended }}C{{lastC $c}}](p, {{$c}}, points, scalars, splitFirstChunk) {{- else}} - p.batchAffineMsmC{{$c}}(points, scalars, splitFirstChunk) + batch{{ $.TAffine }}Msm[bucket{{ $.TAffine }}C{{$c}}, bucket{{ $.TJacobianExtended }}C{{lastC $c}}](p, {{$c}}, points, scalars, splitFirstChunk) {{- end}} {{end}} default: @@ -179,53 +180,38 @@ func msmInner{{ $.TJacobian }}BatchAffine(p *{{ $.TJacobian }}, c int, points [] } } -// msmReduceChunk{{ $.TAffine }}BatchAffine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunk{{ $.TAffine }}BatchAffine(p *{{ $.TJacobian }}, c int, chChunks []chan {{ $.TJacobianExtended }}) *{{ $.TJacobian }} { - var _p {{ $.TJacobianExtended }} - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type Batch{{ $.TAffine }} struct { +type Batch{{ $.TAffine }}[B ib{{ $.TAffine }}] struct { P [MAX_BATCH_SIZE]{{ $.TAffine 
}} R [MAX_BATCH_SIZE]*{{ $.TAffine }} batchSize int cptP int bucketIds map[uint32]struct{} - buckets, points []{{ $.TAffine }} + points []{{ $.TAffine }} + buckets *B } -func newBatch{{ $.TAffine }}(buckets, points []{{ $.TAffine }}) Batch{{ $.TAffine }} { - batchSize := len(buckets) / 5 +func newBatch{{ $.TAffine }}[B ib{{ $.TAffine }}](buckets *B, points []{{ $.TAffine }}) Batch{{ $.TAffine }}[B] { + batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return Batch{{ $.TAffine }}{ + return Batch{{ $.TAffine }}[B]{ buckets: buckets, points: points, batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } } -func (b *Batch{{ $.TAffine }}) IsFull() bool { +func (b *Batch{{ $.TAffine }}[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *Batch{{ $.TAffine }}) ExecuteAndReset() { +func (b *Batch{{ $.TAffine }}[B]) ExecuteAndReset() { if b.cptP == 0 { return } @@ -240,45 +226,45 @@ func (b *Batch{{ $.TAffine }}) ExecuteAndReset() { b.cptP = 0 } -func (b *Batch{{ $.TAffine }}) CanAdd(bID uint32) bool { +func (b *Batch{{ $.TAffine }}[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *Batch{{ $.TAffine }}) Add(op batchOp) { +func (b *Batch{{ $.TAffine }}[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch - B := &b.buckets[op.bucketID] + BK := &(*b.buckets)[op.bucketID] P := &b.points[op.pointID>>1] if P.IsInfinity() { return } // handle special cases with inf or -P / P - if B.IsInfinity() { + if BK.IsInfinity() { if op.isNeg() { - B.Neg(P) + BK.Neg(P) } else { - B.Set(P) + BK.Set(P) } return } if op.isNeg() { // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() + if BK.Equal(P) { + BK.setInfinity() return } } else { // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() return } } // b.bucketIds[b.cptP] = op.bucketID b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B + b.R[b.cptP] = BK if op.isNeg() { b.P[b.cptP].Neg(P) } else { @@ -287,7 +273,7 @@ func (b *Batch{{ $.TAffine }}) Add(op batchOp) { b.cptP++ } -func processQueue{{ $.TAffine }}(queue []batchOp, batch *Batch{{ $.TAffine }}) []batchOp { +func processQueue{{ $.TAffine }}[B ib{{ $.TAffine }}](queue []batchOp, batch *Batch{{ $.TAffine }}[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -302,16 +288,15 @@ func processQueue{{ $.TAffine }}(queue []batchOp, batch *Batch{{ $.TAffine }}) [ } -func msmProcessChunk{{ $.TAffine }}BatchAffine(chunk uint64, +func msmProcessChunk{{ $.TAffine }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64, chRes chan<- {{ $.TJacobianExtended }}, - buckets []{{ $.TAffine }}, c uint64, points []{{ $.TAffine }}, scalars []fr.Element) { mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) - + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -328,7 +313,7 @@ func msmProcessChunk{{ $.TAffine }}BatchAffine(chunk uint64, s.shiftHigh = (c - nbBitsHigh) } - batch := newBatch{{ $.TAffine }}(buckets, points) + batch := newBatch{{ $.TAffine }}(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. 
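// Illustrative sketch, not part of the patch: the bit selection the selector
// above encodes. Window number `chunk` starts at bit jc = chunk*c; when
// jc%64 > 64-c the window straddles a limb boundary, so its low bits come from
// limb jc/64 and the remaining nbBitsHigh bits from the next limb, recombined
// exactly as maskHigh/shiftHigh do. Hypothetical helper:
func windowAt(limbs []uint64, chunk, c uint64) uint64 {
	jc := chunk * c
	index, shift := jc/64, jc%64
	bits := (limbs[index] >> shift) & ((1 << c) - 1)
	if shift > 64-c && index < uint64(len(limbs))-1 {
		nbBitsHigh := shift - (64 - c) // bits that live in the next limb
		maskHigh := uint64(1)<<nbBitsHigh - 1
		bits |= (limbs[index+1] & maskHigh) << (c - nbBitsHigh)
	}
	return bits
}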
nbBatches := 0 for i := 0; i < len(scalars); i++ { @@ -398,17 +383,13 @@ func msmProcessChunk{{ $.TAffine }}BatchAffine(chunk uint64, } -{{range $c := $.CRange}} -{{- if gt $c 9}} -{{- $frBits := mul $.FrNbWords 64}} -{{- $cDividesBits := divides $c $frBits}} -{{- $nbChunks := div $frBits $c}} -func (p *{{ $.TJacobian }}) batchAffineMsmC{{$c}}(points []{{ $.TAffine }}, scalars []fr.Element, splitFirstChunk bool) *{{ $.TJacobian }} { - const ( - c = {{$c}} // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batch{{ $.TAffine }}Msm[B ib{{ $.TAffine }}, J ib{{ $.TJacobianExtended }}](p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, scalars []fr.Element, splitFirstChunk bool) *{{ $.TJacobian }} { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -416,33 +397,26 @@ func (p *{{ $.TJacobian }}) batchAffineMsmC{{$c}}(points []{{ $.TAffine }}, scal // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks{{if not $cDividesBits }} + 1 {{end}} ]chan {{ $.TJacobianExtended }} + chChunks := make([]chan {{ $.TJacobianExtended }}, nbChunks) for i:=0; i < len(chChunks);i++ { chChunks[i] = make(chan {{ $.TJacobianExtended }}, 1) } + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []{{ $.TAffine }}, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]{{ $.TJacobianExtended }}, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
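// Illustrative sketch, not part of the patch: what msmReduceChunk (called at
// the end of this generated function) computes. Chunk j carries weight
// 2^(c*j), and the reduction folds the per-chunk results Horner-style,
// doubling the running total c times before adding the next lower chunk.
// Integer model of the same fold:
func reduceChunks(chunks []uint64, c uint) uint64 {
	total := chunks[len(chunks)-1] // start from the most significant window
	for j := len(chunks) - 2; j >= 0; j-- {
		for l := uint(0); l < c; l++ {
			total += total // in the group this step is a point doubling
		}
		total += chunks[j]
	}
	return total // equals the sum over j of chunks[j] << (c*j)
}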
+ msmProcessChunk{{ $.TAffine }}[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks - 1), points, scalars) + nbChunks-- + } - {{ if not $cDividesBits }} - - // c doesn't divide {{$frBits}}, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []{{ $.TAffine }}, scalars []fr.Element) { - var buckets [1<<(lastC-1)]{{ $.TAffine }} - msmProcessChunk{{ $.TAffine }}BatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []{{ $.TAffine }}, scalars []fr.Element) { - var buckets [1<<(lastC-1)]{{ $.TJacobianExtended }} - msmProcessChunk{{ $.TAffine }}(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - {{- end}} processChunk := func(j int, points []{{ $.TAffine }}, scalars []fr.Element, chChunk chan {{ $.TJacobianExtended }}) { - var buckets [1<<(c-1)]{{ $.TAffine }} - msmProcessChunk{{ $.TAffine }}BatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunk{{ $.TAffine }}BatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j >0; j-- { @@ -465,10 +439,29 @@ func (p *{{ $.TJacobian }}) batchAffineMsmC{{$c}}(points []{{ $.TAffine }}, scal }() } - return msmReduceChunk{{ $.TAffine }}BatchAffine(p, c, chChunks[:]) + return msmReduceChunk{{ $.TAffine }}(p, int(c), chChunks[:]) } + + + + +{{- range $c := $.CRange}} +type bucket{{ $.TAffine }}C{{$c}} [1<<({{$c}}-1)]{{ $.TAffine }} +{{- end}} +{{- range $c := $.CRange}} +type bucket{{ $.TJacobianExtended }}C{{$c}} [1<<({{$c}}-1)]{{ $.TJacobianExtended }} {{- end}} -{{end}} +type ib{{ $.TAffine }} interface { + {{- range $i, $c := $.CRange}} + bucket{{ $.TAffine }}C{{$c}} {{- if not (last $i $.CRange)}} | {{- end}} + {{- end}} +} + +type ib{{ $.TJacobianExtended }} interface { + {{- range $i, $c := $.CRange}} + bucket{{ $.TJacobianExtended }}C{{$c}} {{- if not (last $i $.CRange)}} | {{- end}} + {{- end}} +} {{end }} diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index cde8bd0b2a..93e26a09d6 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -91,7 +91,7 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmC{{ $.TAffine }}[bucket{{ $.TJacobianExtended }}C16, bucket{{ $.TJacobianExtended }}C{{lastC 16}}](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -287,7 +287,7 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { var testPoint {{ $.TAffine }} - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { From 5edbf300a5f06062a64953552e52dcc9071aa924 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 8 Nov 2022 14:36:18 -0600 Subject: [PATCH 04/43] feat,style: factorize code between extjac and affine msm using generics --- ecc/bls12-377/multiexp.go | 760 ++++++--------- ecc/bls12-377/multiexp_affine.go | 920 +++--------------- 
ecc/bls12-377/multiexp_jacobian.go | 229 +++++ ecc/bls12-377/multiexp_test.go | 34 +- ecc/bls12-378/multiexp.go | 760 ++++++--------- ecc/bls12-378/multiexp_affine.go | 920 +++--------------- ecc/bls12-378/multiexp_jacobian.go | 229 +++++ ecc/bls12-378/multiexp_test.go | 34 +- ecc/bls12-381/multiexp.go | 760 ++++++--------- ecc/bls12-381/multiexp_affine.go | 920 +++--------------- ecc/bls12-381/multiexp_jacobian.go | 229 +++++ ecc/bls12-381/multiexp_test.go | 34 +- ecc/bls24-315/multiexp.go | 760 ++++++--------- ecc/bls24-315/multiexp_affine.go | 920 +++--------------- ecc/bls24-315/multiexp_jacobian.go | 229 +++++ ecc/bls24-315/multiexp_test.go | 34 +- ecc/bls24-317/multiexp.go | 760 ++++++--------- ecc/bls24-317/multiexp_affine.go | 920 +++--------------- ecc/bls24-317/multiexp_jacobian.go | 229 +++++ ecc/bls24-317/multiexp_test.go | 34 +- ecc/bn254/multiexp.go | 760 ++++++--------- ecc/bn254/multiexp_affine.go | 920 +++--------------- ecc/bn254/multiexp_jacobian.go | 229 +++++ ecc/bn254/multiexp_test.go | 34 +- ecc/bw6-633/multiexp.go | 710 +++++--------- ecc/bw6-633/multiexp_affine.go | 920 +++--------------- ecc/bw6-633/multiexp_jacobian.go | 177 ++++ ecc/bw6-633/multiexp_test.go | 34 +- ecc/bw6-756/multiexp.go | 712 +++++--------- ecc/bw6-756/multiexp_affine.go | 920 +++--------------- ecc/bw6-756/multiexp_jacobian.go | 177 ++++ ecc/bw6-756/multiexp_test.go | 34 +- ecc/bw6-761/multiexp.go | 712 +++++--------- ecc/bw6-761/multiexp_affine.go | 920 +++--------------- ecc/bw6-761/multiexp_jacobian.go | 177 ++++ ecc/bw6-761/multiexp_test.go | 34 +- internal/generator/config/curve.go | 20 +- internal/generator/ecc/generate.go | 29 +- .../generator/ecc/template/multiexp.go.tmpl | 212 ++-- .../ecc/template/multiexp_affine.go.tmpl | 406 ++------ .../ecc/template/multiexp_jacobian.go.tmpl | 106 ++ .../ecc/template/tests/multiexp.go.tmpl | 16 +- 42 files changed, 6040 insertions(+), 11934 deletions(-) create mode 100644 ecc/bls12-377/multiexp_jacobian.go create mode 100644 ecc/bls12-378/multiexp_jacobian.go create mode 100644 ecc/bls12-381/multiexp_jacobian.go create mode 100644 ecc/bls24-315/multiexp_jacobian.go create mode 100644 ecc/bls24-317/multiexp_jacobian.go create mode 100644 ecc/bn254/multiexp_jacobian.go create mode 100644 ecc/bw6-633/multiexp_jacobian.go create mode 100644 ecc/bw6-756/multiexp_jacobian.go create mode 100644 ecc/bw6-761/multiexp_jacobian.go create mode 100644 internal/generator/ecc/template/multiexp_jacobian.go.tmpl diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index 0f487a104c..1673861355 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -25,143 +25,6 @@ import ( "runtime" ) -// selector stores the index, mask and shifts needed to select bits from a scalar -// it is used during the multiExp algorithm or the batch scalar multiplication -type selector struct { - index uint64 // index in the multi-word scalar to select bits from - mask uint64 // mask (c-bit wide) - shift uint64 // shift needed to get our bits on low positions - - multiWordSelect bool // set to true if we need to select bits from 2 words (case where c doesn't divide 64) - maskHigh uint64 // same than mask, for index+1 - shiftHigh uint64 // same than shift, for index+1 -} - -// partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits -// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract -// 2^{c} to the current digit, making it negative. 
-// negative digits can be processed in a later step as adding -G into the bucket instead of G -// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) -// scalarsMont indicates wheter the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { - toReturn := make([]fr.Element, len(scalars)) - - // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words - - // compute offset and word selector / shift to select the right bits of our windows - selectors := make([]selector, nbChunks) - for chunk := uint64(0); chunk < nbChunks; chunk++ { - jc := uint64(chunk * c) - d := selector{} - d.index = jc / 64 - d.shift = jc - (d.index * 64) - d.mask = mask << d.shift - d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1) - if d.multiWordSelect { - nbBitsHigh := d.shift - uint64(64-c) - d.maskHigh = (1 << nbBitsHigh) - 1 - d.shiftHigh = (c - nbBitsHigh) - } - selectors[chunk] = d - } - - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK. - chSmallValues := make(chan int, nbTasks) - - parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 - for i := start; i < end; i++ { - var carry int - - scalar := scalars[i] - if scalarsMont { - scalar.FromMont() - } - if scalar.FitsOnOneWord() { - // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } - } - - // for each chunk in the scalar, compute the current digit, and an eventual carry - for chunk := uint64(0); chunk < nbChunks; chunk++ { - s := selectors[chunk] - - // init with carry if any - digit := carry - carry = 0 - - // digit = value of the c-bit window - digit += int((scalar[s.index] & s.mask) >> s.shift) - - if s.multiWordSelect { - // we are selecting bits over 2 words - digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh - } - - // if digit is zero, no impact on result - if digit == 0 { - continue - } - - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. 
- if digit >= max { - digit -= (1 << c) - carry = 1 - } - - var bits uint64 - if digit >= 0 { - bits = uint64(digit) - } else { - bits = uint64(-digit-1) | msbWindow - } - - toReturn[i][s.index] |= (bits << s.shift) - if s.multiWordSelect { - toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - } - - } - } - - chSmallValues <- smallValues - - }, nbTasks) - - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o - } - return toReturn, smallValues -} - // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf // // This call return an error if len(scalars) != len(points) or if provided config is invalid. @@ -221,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -266,7 +129,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1Jac , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 // we have nbSplits intermediate results that we must sum together. 
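To make the signed-digit recoding described above concrete: each c-bit window holding a digit >= 2^{c-1} is replaced by digit - 2^c (negative) plus a carry of 1 into the next window. Below is a minimal standalone Go sketch of that borrowing rule; `recode` is an illustrative helper, not a function in this patch, and it handles only single-word scalars without the msbWindow packing the real code uses:

```go
package main

import "fmt"

// recode splits a scalar into signed base-2^c digits in [-2^{c-1}, 2^{c-1}).
// A digit >= 2^{c-1} borrows 2^c from the next window, as partitionScalars does.
func recode(scalar uint64, c uint) []int {
	mask := uint64(1)<<c - 1
	max := 1 << (c - 1)
	var digits []int
	carry := 0
	for scalar != 0 || carry != 0 {
		d := int(scalar&mask) + carry
		scalar >>= c
		carry = 0
		if d >= max {
			d -= 1 << c // borrow 2^c from the next window
			carry = 1
		}
		digits = append(digits, d)
	}
	return digits
}

func main() {
	// 0xB5 = 181; with c = 4 the windows are 5 and 11.
	// 11 >= 8, so it is recoded as -5 with a carry:
	// 181 = 5 + 16*(-5) + 256*1
	fmt.Println(recode(0xB5, 4)) // [5 -5 1]
}
```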
@@ -276,12 +139,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG1Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG1Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -290,169 +153,79 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG1Jac(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] + _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 10: - msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC10] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + _innerMsmG1(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 11: - msmCG1Affine[bucketg1JacExtendedC11, 
bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC11] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] + _innerMsmG1(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 12: - msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC12] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 13: - msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC13] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] + _innerMsmG1(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 14: - msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC14] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 15: - msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC15] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + _innerMsmG1(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 16: - msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) case 20: - msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC20] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] + _innerMsmG1(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 21: - msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC21] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - 
_p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -func msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64, - chRes chan<- g1JacExtended, - c uint64, - points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) - } - } - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - - var runningSum, total g1JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { nbChunks++ } + // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is @@ -464,45 +237,54 @@ func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, poi chChunks[i] = make(chan g1JacExtended, 1) } - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel last C restore. - msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } + // the last chunk may be processed with a different method than the rest, as it could be smaller. 
+ go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars) + for j := int(nbChunks - 2); j > 0; j-- { + go processChunk(uint64(j), chChunks[j], c, points, scalars) } - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] + // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed + // in the ~same amount of time + if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + if !splitFirstChunk { + go processChunk(0, chChunks[0], c, points, scalars) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, chSplit, c, points[:split], scalars[:split]) + go processChunk(0, chSplit, c, points[split:], scalars[split:]) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } +// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { + var _p g1JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf // // This call return an error if len(scalars) != len(points) or if provided config is invalid. @@ -562,7 +344,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -607,7 +389,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 // we have nbSplits intermediate results that we must sum together. 
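This refactor only changes internals (chunk processors are now passed as function values); callers still go through MultiExp. A hedged caller-side sketch, assuming the public API (`Generators`, `ecc.MultiExpConfig`, `ScalarMultiplication`) is as elsewhere in the library:

```go
package main

import (
	"fmt"
	"math/big"

	"github.com/consensys/gnark-crypto/ecc"
	bls12377 "github.com/consensys/gnark-crypto/ecc/bls12-377"
	"github.com/consensys/gnark-crypto/ecc/bls12-377/fr"
)

func main() {
	// toy instance: every point is the generator, scalars are 1..4,
	// so the MSM must equal (1+2+3+4)*g = 10*g
	const n = 4
	g1Jac, _, g1Aff, _ := bls12377.Generators()

	points := make([]bls12377.G1Affine, n)
	scalars := make([]fr.Element, n)
	for i := 0; i < n; i++ {
		points[i] = g1Aff
		scalars[i].SetUint64(uint64(i + 1))
	}

	var res, expected bls12377.G1Jac
	if _, err := res.MultiExp(points, scalars, ecc.MultiExpConfig{NbTasks: 2}); err != nil {
		panic(err)
	}
	expected.ScalarMultiplication(&g1Jac, big.NewInt(10))
	fmt.Println(res.Equal(&expected)) // true
}
```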
@@ -617,12 +399,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -631,82 +413,120 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] + _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 6: - msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 7: - msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 8: - msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] + _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 9: - msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 10: - msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC10] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + _innerMsmG2(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 11: - msmCG2Affine[bucketg2JacExtendedC11, 
bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC11] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] + _innerMsmG2(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 12: - msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC12] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 13: - msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC13] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] + _innerMsmG2(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 14: - msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC14] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 15: - msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC15] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] + _innerMsmG2(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 16: - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + case 20: + processChunk := processChunkG2BatchAffine[bucketG2AffineC20] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] + _innerMsmG2(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + case 21: + processChunk := processChunkG2BatchAffine[bucketG2AffineC21] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + default: + panic("not implemented") + } +} - case 17: - msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { - case 18: - msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } - case 19: - msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance - case 20: - msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 
0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan g2JacExtended, 1)
+	}

-	case 21:
-		msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk)
+	// the last chunk may be processed with a different method than the rest, as it could be smaller.
+	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars)

-	case 22:
-		msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk)
+	for j := int(nbChunks - 2); j > 0; j-- {
+		go processChunk(uint64(j), chChunks[j], c, points, scalars)
+	}

-	case 23:
-		msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk)
+	// the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1]
+	// --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunks are processed
+	// in the ~same amount of time
+	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
+		if !splitFirstChunk {
+			go processChunk(0, chChunks[0], c, points, scalars)
+		} else {
+			chSplit := make(chan g2JacExtended, 2)
+			split := len(points) / 2
+			go processChunk(0, chSplit, c, points[:split], scalars[:split])
+			go processChunk(0, chSplit, c, points[split:], scalars[split:])
+			go func() {
+				s1 := <-chSplit
+				s2 := <-chSplit
+				close(chSplit)
+				s1.add(&s2)
+				chChunks[0] <- s1
+			}()
+		}

-	default:
-		panic("not implemented")
	}
+
+	return msmReduceChunkG2Affine(p, int(c), chChunks[:])
}

// msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp
@@ -725,121 +545,139 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J
	return p.unsafeFromJacExtended(&_p)
}

-func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64,
-	chRes chan<- g2JacExtended,
-	c uint64,
-	points []G2Affine,
-	scalars []fr.Element) {
+// selector stores the index, mask and shifts needed to select bits from a scalar
+// it is used during the multiExp algorithm or the batch scalar multiplication
+type selector struct {
+	index uint64 // index in the multi-word scalar to select bits from
+	mask  uint64 // mask (c-bit wide)
+	shift uint64 // shift needed to get our bits on low positions

-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
+	multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
+	maskHigh        uint64 // same as mask, for index+1
+	shiftHigh       uint64 // same as shift, for index+1
+}

-	var buckets B
-	for i := 0; i < len(buckets); i++ {
-		buckets[i].setInfinity()
-	}
+// partitionScalars computes, for each scalar over c-bit wide windows, nbChunks digits
+// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
+// scalarsMont indicates whether the provided scalars are in Montgomery form
+// returns smallValues, the number of scalars that meet the following condition:
+// 0 < scalar < 2^c (in other words, scalars where only the c least significant bits are non-zero)
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+	toReturn := make([]fr.Element, len(scalars))

-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
	}

-	// for each scalars, get the digit corresponding to the chunk we're processing.
-	for i := 0; i < len(scalars); i++ {
-		bits := (scalars[i][s.index] & s.mask) >> s.shift
-		if s.multiWordSelect {
-			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
-		}
-
-		if bits == 0 {
-			continue
-		}
+	mask := uint64((1 << c) - 1)      // low c bits are 1
+	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
+	max := int(1 << (c - 1))          // max value we want for our digits
+	cDivides64 := (64 % c) == 0       // if c doesn't divide 64, we may need to select over multiple words

-		// if msbWindow bit is set, we need to substract
-		if bits&msbWindow == 0 {
-			// add
-			buckets[bits-1].addMixed(&points[i])
-		} else {
-			// sub
-			buckets[bits & ^msbWindow].subMixed(&points[i])
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
		}
+		selectors[chunk] = d
	}

-	// reduce buckets into total
-	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split the first chunk
+	// processing in the msm in 2, to ensure all go routines finish at ~the same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it does, though, this will deadlock.
+ chSmallValues := make(chan int, nbTasks) - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } + parallel.Execute(len(scalars), func(start, end int) { + smallValues := 0 + for i := start; i < end; i++ { + var carry int - chRes <- total + scalar := scalars[i] + if scalarsMont { + scalar.FromMont() + } + if scalar.FitsOnOneWord() { + // everything is 0, no need to process this scalar + if scalar[0] == 0 { + continue + } + // low c-bits are 1 in mask + if scalar[0]&mask == scalar[0] { + smallValues++ + } + } -} + // for each chunk in the scalar, compute the current digit, and an eventual carry + for chunk := uint64(0); chunk < nbChunks; chunk++ { + s := selectors[chunk] -func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // init with carry if any + digit := carry + carry = 0 - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G2Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g2JacExtended, 1<<(lastC-1)) - // TODO @gbotrel last C restore. - msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars) - } + // if digit is zero, no impact on result + if digit == 0 { + continue + } - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract + // 2^{c} to the current digit, making it negative. 
+ if digit >= max { + digit -= (1 << c) + carry = 1 + } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + var bits uint64 + if digit >= 0 { + bits = uint64(digit) + } else { + bits = uint64(-digit-1) | msbWindow + } - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) + toReturn[i][s.index] |= (bits << s.shift) + if s.multiWordSelect { + toReturn[i][s.index+1] |= (bits >> s.shiftHigh) + } + + } + } + + chSmallValues <- smallValues + + }, nbTasks) + + // aggregate small values + close(chSmallValues) + smallValues := 0 + for o := range chSmallValues { + smallValues += o + } + return toReturn, smallValues } diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index 8ff827c422..ac8d41cbb6 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -17,11 +17,7 @@ package bls12377 import ( - "errors" - "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" - "math" - "runtime" ) const MAX_BATCH_SIZE = 600 @@ -34,320 +30,13 @@ func (o batchOp) isNeg() bool { return o.pointID&1 == 1 } -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// processChunkG1BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. // -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) { - var _p G1Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. 
- // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - - case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - - case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - - case 10: - batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - - case 11: - batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - - case 12: - batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - - case 13: - batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - - case 14: - batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - - case 15: - batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - - case 16: - batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - - case 20: - batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - - case 21: - batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -type BatchG1Affine[B ibG1Affine] struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points 
[]G1Affine - buckets *B -} - -func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { - batchSize := len(*buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG1Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} - -func (b *BatchG1Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG1Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG1Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} - -func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -441,66 +130,8 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, } -func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g1JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. 
- msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, int(c), chChunks[:]) -} - -type bucketG1AffineC1 [1 << (1 - 1)]G1Affine -type bucketG1AffineC2 [1 << (2 - 1)]G1Affine -type bucketG1AffineC3 [1 << (3 - 1)]G1Affine +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack type bucketG1AffineC4 [1 << (4 - 1)]G1Affine type bucketG1AffineC5 [1 << (5 - 1)]G1Affine type bucketG1AffineC6 [1 << (6 - 1)]G1Affine @@ -514,42 +145,11 @@ type bucketG1AffineC13 [1 << (13 - 1)]G1Affine type bucketG1AffineC14 [1 << (14 - 1)]G1Affine type bucketG1AffineC15 [1 << (15 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine -type bucketG1AffineC17 [1 << (17 - 1)]G1Affine -type bucketG1AffineC18 [1 << (18 - 1)]G1Affine -type bucketG1AffineC19 [1 << (19 - 1)]G1Affine type bucketG1AffineC20 [1 << (20 - 1)]G1Affine type bucketG1AffineC21 [1 << (21 - 1)]G1Affine -type bucketG1AffineC22 [1 << (22 - 1)]G1Affine -type bucketG1AffineC23 [1 << (23 - 1)]G1Affine -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended -type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended -type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended -type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended -type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended -type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended -type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended -type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended -type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended -type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended -type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended -type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended -type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended -type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended -type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended -type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended -type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended -type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended type ibG1Affine interface { - bucketG1AffineC1 | - bucketG1AffineC2 | - bucketG1AffineC3 | - bucketG1AffineC4 | + bucketG1AffineC4 | bucketG1AffineC5 | bucketG1AffineC6 | bucketG1AffineC7 | @@ -562,258 +162,21 @@ type ibG1Affine interface { bucketG1AffineC14 | bucketG1AffineC15 | bucketG1AffineC16 | - bucketG1AffineC17 | - bucketG1AffineC18 | - bucketG1AffineC19 | bucketG1AffineC20 | - bucketG1AffineC21 | - bucketG1AffineC22 | - 
bucketG1AffineC23 -} - -type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC2 | - bucketg1JacExtendedC3 | - bucketg1JacExtendedC4 | - bucketg1JacExtendedC5 | - bucketg1JacExtendedC6 | - bucketg1JacExtendedC7 | - bucketg1JacExtendedC8 | - bucketg1JacExtendedC9 | - bucketg1JacExtendedC10 | - bucketg1JacExtendedC11 | - bucketg1JacExtendedC12 | - bucketg1JacExtendedC13 | - bucketg1JacExtendedC14 | - bucketg1JacExtendedC15 | - bucketg1JacExtendedC16 | - bucketg1JacExtendedC17 | - bucketg1JacExtendedC18 | - bucketg1JacExtendedC19 | - bucketg1JacExtendedC20 | - bucketg1JacExtendedC21 | - bucketg1JacExtendedC22 | - bucketg1JacExtendedC23 -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. 
- // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 1: - msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - - case 4: - msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - - case 5: - msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - - case 8: - msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - - case 10: - batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - - case 11: - batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - - case 12: - batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - - case 13: - batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - - case 14: - batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - - case 15: - batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - - case 16: - batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - - case 20: - batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - - case 21: - batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } + bucketG1AffineC21 } -type BatchG2Affine[B ibG2Affine] struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine +type BatchG1Affine[B ibG1Affine] struct { + P 
[MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine batchSize int cptP int bucketIds map[uint32]struct{} - points []G2Affine + points []G1Affine buckets *B } -func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { +func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -821,7 +184,7 @@ func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine if batchSize <= 0 { batchSize = 1 } - return BatchG2Affine[B]{ + return BatchG1Affine[B]{ buckets: buckets, points: points, batchSize: batchSize, @@ -829,18 +192,18 @@ func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine } } -func (b *BatchG2Affine[B]) IsFull() bool { +func (b *BatchG1Affine[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *BatchG2Affine[B]) ExecuteAndReset() { +func (b *BatchG1Affine[B]) ExecuteAndReset() { if b.cptP == 0 { return } // for i := 0; i < len(b.R); i++ { // b.R[i].Add(b.R[i], b.P[i]) // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) for k := range b.bucketIds { delete(b.bucketIds, k) } @@ -848,12 +211,12 @@ func (b *BatchG2Affine[B]) ExecuteAndReset() { b.cptP = 0 } -func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { +func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *BatchG2Affine[B]) Add(op batchOp) { +func (b *BatchG1Affine[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &(*b.buckets)[op.bucketID] @@ -895,7 +258,7 @@ func (b *BatchG2Affine[B]) Add(op batchOp) { b.cptP++ } -func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { +func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -910,7 +273,13 @@ func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B] } -func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, +// processChunkG2BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. 
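For intuition: an affine addition R = R + P costs one field inversion, in the denominator of the slope lambda = (y_P - y_R)/(x_P - x_R), and the denominators of a whole batch of independent additions can share a single inversion via Montgomery's trick. A standalone sketch of that trick, not part of the patch, using math/big over a toy field F_101 in place of field elements:

package main

import (
	"fmt"
	"math/big"
)

// batchInvert inverts all elements of a mod p with one ModInverse call:
// a forward pass builds prefix products, the product is inverted once,
// and a backward pass peels off each individual inverse.
func batchInvert(a []*big.Int, p *big.Int) []*big.Int {
	res := make([]*big.Int, len(a))
	acc := big.NewInt(1)
	for i := range a { // res[i] = a[0]*...*a[i-1]
		res[i] = new(big.Int).Set(acc)
		acc.Mul(acc, a[i])
		acc.Mod(acc, p)
	}
	acc.ModInverse(acc, p) // the only inversion
	for i := len(a) - 1; i >= 0; i-- {
		res[i].Mul(res[i], acc) // prefix * (a[i]*...*a[n-1])^-1 = a[i]^-1
		res[i].Mod(res[i], p)
		acc.Mul(acc, a[i])
		acc.Mod(acc, p)
	}
	return res
}

func main() {
	p := big.NewInt(101)
	in := []*big.Int{big.NewInt(2), big.NewInt(3), big.NewInt(7)}
	for _, inv := range batchInvert(in, p) {
		fmt.Println(inv) // 51, 34, 29: the inverses of 2, 3, 7 mod 101
	}
}

With MAX_BATCH_SIZE = 600, one inversion plus roughly three extra multiplications per element replaces 600 inversions, which is why affine buckets become competitive with extended Jacobian ones once c is large enough.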
+// +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -1004,66 +373,8 @@ func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, } -func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G2Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g2JacExtended, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. - msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) -} - -type bucketG2AffineC1 [1 << (1 - 1)]G2Affine -type bucketG2AffineC2 [1 << (2 - 1)]G2Affine -type bucketG2AffineC3 [1 << (3 - 1)]G2Affine +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack type bucketG2AffineC4 [1 << (4 - 1)]G2Affine type bucketG2AffineC5 [1 << (5 - 1)]G2Affine type bucketG2AffineC6 [1 << (6 - 1)]G2Affine @@ -1077,42 +388,11 @@ type bucketG2AffineC13 [1 << (13 - 1)]G2Affine type bucketG2AffineC14 [1 << (14 - 1)]G2Affine type bucketG2AffineC15 [1 << (15 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine -type bucketG2AffineC17 [1 << (17 - 1)]G2Affine -type bucketG2AffineC18 [1 << (18 - 1)]G2Affine -type bucketG2AffineC19 [1 << (19 - 1)]G2Affine type bucketG2AffineC20 [1 << (20 - 1)]G2Affine type bucketG2AffineC21 [1 << (21 - 1)]G2Affine -type bucketG2AffineC22 [1 << (22 - 1)]G2Affine -type bucketG2AffineC23 [1 << (23 - 1)]G2Affine -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended -type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended -type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended -type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended -type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended 
-type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended -type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended -type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended -type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended -type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended -type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended -type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended -type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended -type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended -type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended -type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended -type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended -type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended type ibG2Affine interface { - bucketG2AffineC1 | - bucketG2AffineC2 | - bucketG2AffineC3 | - bucketG2AffineC4 | + bucketG2AffineC4 | bucketG2AffineC5 | bucketG2AffineC6 | bucketG2AffineC7 | @@ -1125,37 +405,113 @@ type ibG2Affine interface { bucketG2AffineC14 | bucketG2AffineC15 | bucketG2AffineC16 | - bucketG2AffineC17 | - bucketG2AffineC18 | - bucketG2AffineC19 | bucketG2AffineC20 | - bucketG2AffineC21 | - bucketG2AffineC22 | - bucketG2AffineC23 + bucketG2AffineC21 +} + +type BatchG2Affine[B ibG2Affine] struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G2Affine + buckets *B } -type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC2 | - bucketg2JacExtendedC3 | - bucketg2JacExtendedC4 | - bucketg2JacExtendedC5 | - bucketg2JacExtendedC6 | - bucketg2JacExtendedC7 | - bucketg2JacExtendedC8 | - bucketg2JacExtendedC9 | - bucketg2JacExtendedC10 | - bucketg2JacExtendedC11 | - bucketg2JacExtendedC12 | - bucketg2JacExtendedC13 | - bucketg2JacExtendedC14 | - bucketg2JacExtendedC15 | - bucketg2JacExtendedC16 | - bucketg2JacExtendedC17 | - bucketg2JacExtendedC18 | - bucketg2JacExtendedC19 | - bucketg2JacExtendedC20 | - bucketg2JacExtendedC21 | - bucketg2JacExtendedC22 | - bucketg2JacExtendedC23 +func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { + batchSize := len(*buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG2Affine[B]{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), + } +} + +func (b *BatchG2Affine[B]) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG2Affine[B]) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG2Affine[B]) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &(*b.buckets)[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(P) + } else { + BK.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P 
== 0
+		if BK.Equal(P) {
+			BK.setInfinity()
+			return
+		}
+	} else {
+		// if bucket == -P, B == 0
+		if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) {
+			BK.setInfinity()
+			return
+		}
+	}
+
+	// b.bucketIds[b.cptP] = op.bucketID
+	b.bucketIds[op.bucketID] = struct{}{}
+	b.R[b.cptP] = BK
+	if op.isNeg() {
+		b.P[b.cptP].Neg(P)
+	} else {
+		b.P[b.cptP].Set(P)
+	}
+	b.cptP++
+}
+
+func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp {
+	for i := len(queue) - 1; i >= 0; i-- {
+		if batch.CanAdd(queue[i].bucketID) {
+			batch.Add(queue[i])
+			if batch.IsFull() {
+				batch.ExecuteAndReset()
+			}
+			queue[i] = queue[len(queue)-1]
+			queue = queue[:len(queue)-1]
+		}
+	}
+	return queue
+
+}
diff --git a/ecc/bls12-377/multiexp_jacobian.go b/ecc/bls12-377/multiexp_jacobian.go
new file mode 100644
index 0000000000..fc89ebd2cc
--- /dev/null
+++ b/ecc/bls12-377/multiexp_jacobian.go
@@ -0,0 +1,229 @@
+// Copyright 2020 ConsenSys Software Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package bls12377
+
+import (
+	"github.com/consensys/gnark-crypto/ecc/bls12-377/fr"
+)
+
+func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64,
+	chRes chan<- g1JacExtended,
+	c uint64,
+	points []G1Affine,
+	scalars []fr.Element) {
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))
+
+	var buckets B
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
+	}
+
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
+	}
+
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].addMixed(&points[i])
+		} else {
+			// sub
+			buckets[bits & ^msbWindow].subMixed(&points[i])
+		}
+	}
+
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
+
+	var runningSum, total g1JacExtended
+	runningSum.setInfinity()
+	total.setInfinity()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].ZZ.IsZero() {
+			runningSum.add(&buckets[k])
+		}
+		total.add(&runningSum)
+	}
+
+	chRes <- total
+
+}
+
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
+type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended
+type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended
+type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended
+type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended
+type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended
+type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended
+type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended
+type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended
+type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended
+type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended
+type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended
+type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended
+type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended
+type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended
+type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended
+type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended
+type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended
+
+type ibg1JacExtended interface {
+	bucketg1JacExtendedC1 |
+		bucketg1JacExtendedC3 |
+		bucketg1JacExtendedC4 |
+		bucketg1JacExtendedC5 |
+		bucketg1JacExtendedC6 |
+		bucketg1JacExtendedC7 |
+		bucketg1JacExtendedC8 |
+		bucketg1JacExtendedC9 |
+		bucketg1JacExtendedC10 |
+		bucketg1JacExtendedC11 |
+		bucketg1JacExtendedC12 |
+		bucketg1JacExtendedC13 |
+		bucketg1JacExtendedC14 |
+		bucketg1JacExtendedC15 |
+		bucketg1JacExtendedC16 |
+		bucketg1JacExtendedC20 |
+		bucketg1JacExtendedC21
+}
+
+func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64,
+	chRes chan<- g2JacExtended,
+	c uint64,
+	points []G2Affine,
+	scalars []fr.Element) {
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))
+
+	var buckets B
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
+	}
+
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
+	}
+
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].addMixed(&points[i])
+		} else {
+			// sub
+			buckets[bits & ^msbWindow].subMixed(&points[i])
+		}
+	}
+
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ...
+ n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].ZZ.IsZero() { + runningSum.add(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack +type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended +type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended +type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended +type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended +type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended +type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended +type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended +type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended +type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended +type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended +type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended +type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended +type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended + +type ibg2JacExtended interface { + bucketg2JacExtendedC1 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC4 | + bucketg2JacExtendedC5 | + bucketg2JacExtendedC6 | + bucketg2JacExtendedC7 | + bucketg2JacExtendedC8 | + bucketg2JacExtendedC9 | + bucketg2JacExtendedC10 | + bucketg2JacExtendedC11 | + bucketg2JacExtendedC12 | + bucketg2JacExtendedC13 | + bucketg2JacExtendedC14 | + bucketg2JacExtendedC15 | + bucketg2JacExtendedC16 | + bucketg2JacExtendedC20 | + bucketg2JacExtendedC21 +} diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index a14b7946f2..7882874fda 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) + innerMsmG1(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} + cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -130,10 +130,10 @@ func TestMultiExpG1(t *testing.T) { results := make([]G1Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&results[i], int(c), samplePoints[:], scalars, false) + innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG1Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -171,10 +171,10 @@ func 
TestMultiExpG1(t *testing.T) { results := make([]G1Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG1Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -209,8 +209,8 @@ func TestMultiExpG1(t *testing.T) { var result1, result2 G1Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) if !result1.Equal(&result2) { return false } @@ -288,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -461,10 +461,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePoints[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -502,10 +502,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -540,8 +540,8 @@ func TestMultiExpG2(t *testing.T) { var result1, result2 G2Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result2, int(c), samplePoints[:], scalars, false) if 
!result1.Equal(&result2) { return false } @@ -619,7 +619,7 @@ func BenchmarkMultiExpG2(b *testing.B) { b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index ebc19dc090..862cca829b 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -25,143 +25,6 @@ import ( "runtime" ) -// selector stores the index, mask and shifts needed to select bits from a scalar -// it is used during the multiExp algorithm or the batch scalar multiplication -type selector struct { - index uint64 // index in the multi-word scalar to select bits from - mask uint64 // mask (c-bit wide) - shift uint64 // shift needed to get our bits on low positions - - multiWordSelect bool // set to true if we need to select bits from 2 words (case where c doesn't divide 64) - maskHigh uint64 // same than mask, for index+1 - shiftHigh uint64 // same than shift, for index+1 -} - -// partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits -// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract -// 2^{c} to the current digit, making it negative. -// negative digits can be processed in a later step as adding -G into the bucket instead of G -// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) -// scalarsMont indicates wheter the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { - toReturn := make([]fr.Element, len(scalars)) - - // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words - - // compute offset and word selector / shift to select the right bits of our windows - selectors := make([]selector, nbChunks) - for chunk := uint64(0); chunk < nbChunks; chunk++ { - jc := uint64(chunk * c) - d := selector{} - d.index = jc / 64 - d.shift = jc - (d.index * 64) - d.mask = mask << d.shift - d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1) - if d.multiWordSelect { - nbBitsHigh := d.shift - uint64(64-c) - d.maskHigh = (1 << nbBitsHigh) - 1 - d.shiftHigh = (c - nbBitsHigh) - } - selectors[chunk] = d - } - - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is 
not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK. - chSmallValues := make(chan int, nbTasks) - - parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 - for i := start; i < end; i++ { - var carry int - - scalar := scalars[i] - if scalarsMont { - scalar.FromMont() - } - if scalar.FitsOnOneWord() { - // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } - } - - // for each chunk in the scalar, compute the current digit, and an eventual carry - for chunk := uint64(0); chunk < nbChunks; chunk++ { - s := selectors[chunk] - - // init with carry if any - digit := carry - carry = 0 - - // digit = value of the c-bit window - digit += int((scalar[s.index] & s.mask) >> s.shift) - - if s.multiWordSelect { - // we are selecting bits over 2 words - digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh - } - - // if digit is zero, no impact on result - if digit == 0 { - continue - } - - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - if digit >= max { - digit -= (1 << c) - carry = 1 - } - - var bits uint64 - if digit >= 0 { - bits = uint64(digit) - } else { - bits = uint64(-digit-1) | msbWindow - } - - toReturn[i][s.index] |= (bits << s.shift) - if s.multiWordSelect { - toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - } - - } - } - - chSmallValues <- smallValues - - }, nbTasks) - - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o - } - return toReturn, smallValues -} - // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf // // This call return an error if len(scalars) != len(points) or if provided config is invalid. @@ -221,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -266,7 +129,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1Jac , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 // we have nbSplits intermediate results that we must sum together. 
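To get a feel for the window-size heuristic above, the cost model cost(c) = (bits/c) * (nbPoints + 2^c) can be evaluated directly. A standalone sketch, not part of the patch, for a 256-bit scalar field and the window sizes this patch keeps:

package main

import (
	"fmt"
	"math"
)

// bestC returns the window size with the smallest modeled cost,
// cost(c) = (bits/c) * (nbPoints + 2^c), mirroring the heuristic above.
func bestC(nbPoints, bits int, implementedCs []uint64) uint64 {
	var C uint64
	min := math.MaxFloat64
	for _, c := range implementedCs {
		cost := float64(bits) * float64(nbPoints+(1<<c)) / float64(c)
		if cost < min {
			min = cost
			C = c
		}
	}
	return C
}

func main() {
	cs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
	for _, n := range []int{1 << 10, 1 << 16, 1 << 20, 1 << 24} {
		fmt.Printf("nbPoints=%d -> c=%d\n", n, bestC(n, 256, cs))
	}
	// under this model: c=8 for 2^10, c=13 for 2^16, c=16 for 2^20, c=20 for 2^24
}

As the deleted comment notes, this model is only approximate; cache behavior and the size of a group element also matter in practice.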
@@ -276,12 +139,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG1Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG1Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -290,169 +153,79 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG1Jac(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] + _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 10: - msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC10] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + _innerMsmG1(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 11: - msmCG1Affine[bucketg1JacExtendedC11, 
bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC11] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] + _innerMsmG1(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 12: - msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC12] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 13: - msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC13] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] + _innerMsmG1(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 14: - msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC14] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 15: - msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC15] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + _innerMsmG1(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 16: - msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) case 20: - msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC20] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] + _innerMsmG1(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 21: - msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC21] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - 
_p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -func msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64, - chRes chan<- g1JacExtended, - c uint64, - points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) - } - } - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - - var runningSum, total g1JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { nbChunks++ } + // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is @@ -464,45 +237,54 @@ func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, poi chChunks[i] = make(chan g1JacExtended, 1) } - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel last C restore. - msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } + // the last chunk may be processed with a different method than the rest, as it could be smaller. 
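Concretely, the last chunk is smaller whenever c does not divide the 256-bit scalar size: the top window then carries only 256 mod c bits. A standalone sketch, not part of the patch, that reproduces the processChunk/processLastChunk pairings visible in the innerMsmG1 switch above (c=13 leaves a 9-bit last chunk, c=15 a 1-bit one, c=16 divides evenly):

package main

import "fmt"

func main() {
	const bits = 4 * 64 // fr.Limbs * 64 on a 4-limb scalar field
	for _, c := range []int{13, 14, 15, 16} {
		nbChunks := bits / c
		lastC := bits - c*nbChunks // bits left over for the extra top chunk
		if lastC == 0 {
			lastC = c // c divides 256: every chunk is full width
		} else {
			nbChunks++
		}
		fmt.Printf("c=%d: %d chunks, last chunk carries %d bits\n", c, nbChunks, lastC)
	}
}

A lastC-bit window only needs 2^{lastC-1} buckets, which is why, for example, case 13 pairs its batch-affine chunks with a bucketg1JacExtendedC9 last chunk.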
+ go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars) + for j := int(nbChunks - 2); j > 0; j-- { + go processChunk(uint64(j), chChunks[j], c, points, scalars) } - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] + // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed + // in the ~same amount of time + if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + if !splitFirstChunk { + go processChunk(0, chChunks[0], c, points, scalars) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, chSplit, c, points[:split], scalars[:split]) + go processChunk(0, chSplit, c, points[split:], scalars[split:]) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } +// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { + var _p g1JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf // // This call return an error if len(scalars) != len(points) or if provided config is invalid. @@ -562,7 +344,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -607,7 +389,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 // we have nbSplits intermediate results that we must sum together. 
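The chunk reduction in msmReduceChunkG1Affine above is Horner's rule in the exponent: with per-chunk results t_0 ... t_{n-1} (least-significant window first), the MSM result is t_{n-1}*2^{c(n-1)} + ... + t_1*2^c + t_0, evaluated from the top down with c doublings per step. A toy integer model, not part of the patch:

package main

import "fmt"

// reduce mirrors msmReduceChunkG1Affine with ints standing in for points:
// start from the most significant chunk, then repeatedly double c times
// and add the next chunk down.
func reduce(chunks []int, c int) int {
	p := chunks[len(chunks)-1]
	for j := len(chunks) - 2; j >= 0; j-- {
		for l := 0; l < c; l++ {
			p += p // stand-in for _p.double(&_p)
		}
		p += chunks[j] // stand-in for _p.add(&totalj)
	}
	return p
}

func main() {
	// chunk digits of 0x321 in base 2^4, least significant first
	fmt.Println(reduce([]int{1, 2, 3}, 4)) // prints 801 = 0x321
}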
@@ -617,12 +399,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -631,82 +413,120 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] + _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 6: - msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 7: - msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 8: - msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] + _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 9: - msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 10: - msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC10] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + _innerMsmG2(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 11: - msmCG2Affine[bucketg2JacExtendedC11, 
bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC11] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] + _innerMsmG2(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 12: - msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC12] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 13: - msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC13] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] + _innerMsmG2(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 14: - msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC14] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 15: - msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC15] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] + _innerMsmG2(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 16: - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + case 20: + processChunk := processChunkG2BatchAffine[bucketG2AffineC20] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] + _innerMsmG2(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + case 21: + processChunk := processChunkG2BatchAffine[bucketG2AffineC21] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + default: + panic("not implemented") + } +} - case 17: - msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { - case 18: - msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } - case 19: - msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance - case 20: - msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 
0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } - case 21: - msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) + // the last chunk may be processed with a different method than the rest, as it could be smaller. + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) - case 22: - msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) + for j := int(nbChunks - 2); j > 0; j-- { + go processChunk(uint64(j), chChunks[j], c, points, scalars) + } - case 23: - msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) + // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] + // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed + // in the ~same amount of time + if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + if !splitFirstChunk { + go processChunk(0, chChunks[0], c, points, scalars) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, chSplit, c, points[:split], scalars[:split]) + go processChunk(0, chSplit, c, points[split:], scalars[split:]) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } - default: - panic("not implemented") } + + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } // msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp @@ -725,121 +545,139 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J return p.unsafeFromJacExtended(&_p) } -func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64, - chRes chan<- g2JacExtended, - c uint64, - points []G2Affine, - scalars []fr.Element) { +// selector stores the index, mask and shifts needed to select bits from a scalar +// it is used during the multiExp algorithm or the batch scalar multiplication +type selector struct { + index uint64 // index in the multi-word scalar to select bits from + mask uint64 // mask (c-bit wide) + shift uint64 // shift needed to get our bits on low positions - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + multiWordSelect bool // set to true if we need to select bits from 2 words (case where c doesn't divide 64) + maskHigh uint64 // same than mask, for index+1 + shiftHigh uint64 // same than shift, for index+1 +} - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } +// partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits +// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract +// 2^{c} to the current digit, making it negative. 
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
+// scalarsMont indicates whether the provided scalars are in Montgomery form
+// returns smallValues, which represents the number of scalars that meet the following condition
+// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero)
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+	toReturn := make([]fr.Element, len(scalars))

-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
 	}

-	// for each scalars, get the digit corresponding to the chunk we're processing.
-	for i := 0; i < len(scalars); i++ {
-		bits := (scalars[i][s.index] & s.mask) >> s.shift
-		if s.multiWordSelect {
-			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
-		}
-
-		if bits == 0 {
-			continue
-		}
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
+	max := int(1 << (c - 1)) // max value we want for our digits
+	cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words

-		// if msbWindow bit is set, we need to substract
-		if bits&msbWindow == 0 {
-			// add
-			buckets[bits-1].addMixed(&points[i])
-		} else {
-			// sub
-			buckets[bits & ^msbWindow].subMixed(&points[i])
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
 		}
+		selectors[chunk] = d
 	}

-	// reduce buckets into total
-	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split the first chunk
+	// processing in the msm in 2, to ensure all go routines finish at ~same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it does, though, this will deadlock.
+ chSmallValues := make(chan int, nbTasks) - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } + parallel.Execute(len(scalars), func(start, end int) { + smallValues := 0 + for i := start; i < end; i++ { + var carry int - chRes <- total + scalar := scalars[i] + if scalarsMont { + scalar.FromMont() + } + if scalar.FitsOnOneWord() { + // everything is 0, no need to process this scalar + if scalar[0] == 0 { + continue + } + // low c-bits are 1 in mask + if scalar[0]&mask == scalar[0] { + smallValues++ + } + } -} + // for each chunk in the scalar, compute the current digit, and an eventual carry + for chunk := uint64(0); chunk < nbChunks; chunk++ { + s := selectors[chunk] -func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // init with carry if any + digit := carry + carry = 0 - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G2Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g2JacExtended, 1<<(lastC-1)) - // TODO @gbotrel last C restore. - msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars) - } + // if digit is zero, no impact on result + if digit == 0 { + continue + } - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract + // 2^{c} to the current digit, making it negative. 
+ if digit >= max { + digit -= (1 << c) + carry = 1 + } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + var bits uint64 + if digit >= 0 { + bits = uint64(digit) + } else { + bits = uint64(-digit-1) | msbWindow + } - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) + toReturn[i][s.index] |= (bits << s.shift) + if s.multiWordSelect { + toReturn[i][s.index+1] |= (bits >> s.shiftHigh) + } + + } + } + + chSmallValues <- smallValues + + }, nbTasks) + + // aggregate small values + close(chSmallValues) + smallValues := 0 + for o := range chSmallValues { + smallValues += o + } + return toReturn, smallValues } diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index 4becea22f3..583761fe76 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -17,11 +17,7 @@ package bls12378 import ( - "errors" - "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-378/fr" - "math" - "runtime" ) const MAX_BATCH_SIZE = 600 @@ -34,320 +30,13 @@ func (o batchOp) isNeg() bool { return o.pointID&1 == 1 } -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// processChunkG1BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. // -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) { - var _p G1Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. 
- // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - - case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - - case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - - case 10: - batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - - case 11: - batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - - case 12: - batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - - case 13: - batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - - case 14: - batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - - case 15: - batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - - case 16: - batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - - case 20: - batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - - case 21: - batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -type BatchG1Affine[B ibG1Affine] struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points 
[]G1Affine - buckets *B -} - -func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { - batchSize := len(*buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG1Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} - -func (b *BatchG1Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG1Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG1Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} - -func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -441,66 +130,8 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, } -func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g1JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. 
- msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, int(c), chChunks[:]) -} - -type bucketG1AffineC1 [1 << (1 - 1)]G1Affine -type bucketG1AffineC2 [1 << (2 - 1)]G1Affine -type bucketG1AffineC3 [1 << (3 - 1)]G1Affine +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack type bucketG1AffineC4 [1 << (4 - 1)]G1Affine type bucketG1AffineC5 [1 << (5 - 1)]G1Affine type bucketG1AffineC6 [1 << (6 - 1)]G1Affine @@ -514,42 +145,11 @@ type bucketG1AffineC13 [1 << (13 - 1)]G1Affine type bucketG1AffineC14 [1 << (14 - 1)]G1Affine type bucketG1AffineC15 [1 << (15 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine -type bucketG1AffineC17 [1 << (17 - 1)]G1Affine -type bucketG1AffineC18 [1 << (18 - 1)]G1Affine -type bucketG1AffineC19 [1 << (19 - 1)]G1Affine type bucketG1AffineC20 [1 << (20 - 1)]G1Affine type bucketG1AffineC21 [1 << (21 - 1)]G1Affine -type bucketG1AffineC22 [1 << (22 - 1)]G1Affine -type bucketG1AffineC23 [1 << (23 - 1)]G1Affine -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended -type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended -type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended -type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended -type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended -type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended -type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended -type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended -type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended -type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended -type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended -type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended -type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended -type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended -type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended -type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended -type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended -type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended type ibG1Affine interface { - bucketG1AffineC1 | - bucketG1AffineC2 | - bucketG1AffineC3 | - bucketG1AffineC4 | + bucketG1AffineC4 | bucketG1AffineC5 | bucketG1AffineC6 | bucketG1AffineC7 | @@ -562,258 +162,21 @@ type ibG1Affine interface { bucketG1AffineC14 | bucketG1AffineC15 | bucketG1AffineC16 | - bucketG1AffineC17 | - bucketG1AffineC18 | - bucketG1AffineC19 | bucketG1AffineC20 | - bucketG1AffineC21 | - bucketG1AffineC22 | - 
bucketG1AffineC23 -} - -type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC2 | - bucketg1JacExtendedC3 | - bucketg1JacExtendedC4 | - bucketg1JacExtendedC5 | - bucketg1JacExtendedC6 | - bucketg1JacExtendedC7 | - bucketg1JacExtendedC8 | - bucketg1JacExtendedC9 | - bucketg1JacExtendedC10 | - bucketg1JacExtendedC11 | - bucketg1JacExtendedC12 | - bucketg1JacExtendedC13 | - bucketg1JacExtendedC14 | - bucketg1JacExtendedC15 | - bucketg1JacExtendedC16 | - bucketg1JacExtendedC17 | - bucketg1JacExtendedC18 | - bucketg1JacExtendedC19 | - bucketg1JacExtendedC20 | - bucketg1JacExtendedC21 | - bucketg1JacExtendedC22 | - bucketg1JacExtendedC23 -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. 
- // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 1: - msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - - case 4: - msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - - case 5: - msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - - case 8: - msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - - case 10: - batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - - case 11: - batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - - case 12: - batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - - case 13: - batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - - case 14: - batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - - case 15: - batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - - case 16: - batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - - case 20: - batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - - case 21: - batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } + bucketG1AffineC21 } -type BatchG2Affine[B ibG2Affine] struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine +type BatchG1Affine[B ibG1Affine] struct { + P 
[MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine batchSize int cptP int bucketIds map[uint32]struct{} - points []G2Affine + points []G1Affine buckets *B } -func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { +func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -821,7 +184,7 @@ func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine if batchSize <= 0 { batchSize = 1 } - return BatchG2Affine[B]{ + return BatchG1Affine[B]{ buckets: buckets, points: points, batchSize: batchSize, @@ -829,18 +192,18 @@ func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine } } -func (b *BatchG2Affine[B]) IsFull() bool { +func (b *BatchG1Affine[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *BatchG2Affine[B]) ExecuteAndReset() { +func (b *BatchG1Affine[B]) ExecuteAndReset() { if b.cptP == 0 { return } // for i := 0; i < len(b.R); i++ { // b.R[i].Add(b.R[i], b.P[i]) // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) for k := range b.bucketIds { delete(b.bucketIds, k) } @@ -848,12 +211,12 @@ func (b *BatchG2Affine[B]) ExecuteAndReset() { b.cptP = 0 } -func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { +func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *BatchG2Affine[B]) Add(op batchOp) { +func (b *BatchG1Affine[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &(*b.buckets)[op.bucketID] @@ -895,7 +258,7 @@ func (b *BatchG2Affine[B]) Add(op batchOp) { b.cptP++ } -func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { +func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -910,7 +273,13 @@ func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B] } -func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, +// processChunkG2BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. 
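+// (As a sketch of where the saving comes from, using the standard affine chord
+// formulas rather than anything specific to this file: adding P to a bucket R
+// computes lambda = (Py-Ry)/(Px-Rx), Rx' = lambda^2 - Rx - Px and
+// Ry' = lambda*(Rx-Rx') - Ry, i.e. one field inversion per addition;
+// Montgomery's trick inverts k such denominators with a single inversion plus
+// about 3(k-1) multiplications, amortizing the inversion across the batch.)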
+// +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -1004,66 +373,8 @@ func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, } -func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G2Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g2JacExtended, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. - msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) -} - -type bucketG2AffineC1 [1 << (1 - 1)]G2Affine -type bucketG2AffineC2 [1 << (2 - 1)]G2Affine -type bucketG2AffineC3 [1 << (3 - 1)]G2Affine +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack type bucketG2AffineC4 [1 << (4 - 1)]G2Affine type bucketG2AffineC5 [1 << (5 - 1)]G2Affine type bucketG2AffineC6 [1 << (6 - 1)]G2Affine @@ -1077,42 +388,11 @@ type bucketG2AffineC13 [1 << (13 - 1)]G2Affine type bucketG2AffineC14 [1 << (14 - 1)]G2Affine type bucketG2AffineC15 [1 << (15 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine -type bucketG2AffineC17 [1 << (17 - 1)]G2Affine -type bucketG2AffineC18 [1 << (18 - 1)]G2Affine -type bucketG2AffineC19 [1 << (19 - 1)]G2Affine type bucketG2AffineC20 [1 << (20 - 1)]G2Affine type bucketG2AffineC21 [1 << (21 - 1)]G2Affine -type bucketG2AffineC22 [1 << (22 - 1)]G2Affine -type bucketG2AffineC23 [1 << (23 - 1)]G2Affine -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended -type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended -type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended -type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended -type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended 
-type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended -type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended -type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended -type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended -type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended -type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended -type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended -type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended -type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended -type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended -type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended -type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended -type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended type ibG2Affine interface { - bucketG2AffineC1 | - bucketG2AffineC2 | - bucketG2AffineC3 | - bucketG2AffineC4 | + bucketG2AffineC4 | bucketG2AffineC5 | bucketG2AffineC6 | bucketG2AffineC7 | @@ -1125,37 +405,113 @@ type ibG2Affine interface { bucketG2AffineC14 | bucketG2AffineC15 | bucketG2AffineC16 | - bucketG2AffineC17 | - bucketG2AffineC18 | - bucketG2AffineC19 | bucketG2AffineC20 | - bucketG2AffineC21 | - bucketG2AffineC22 | - bucketG2AffineC23 + bucketG2AffineC21 +} + +type BatchG2Affine[B ibG2Affine] struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G2Affine + buckets *B } -type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC2 | - bucketg2JacExtendedC3 | - bucketg2JacExtendedC4 | - bucketg2JacExtendedC5 | - bucketg2JacExtendedC6 | - bucketg2JacExtendedC7 | - bucketg2JacExtendedC8 | - bucketg2JacExtendedC9 | - bucketg2JacExtendedC10 | - bucketg2JacExtendedC11 | - bucketg2JacExtendedC12 | - bucketg2JacExtendedC13 | - bucketg2JacExtendedC14 | - bucketg2JacExtendedC15 | - bucketg2JacExtendedC16 | - bucketg2JacExtendedC17 | - bucketg2JacExtendedC18 | - bucketg2JacExtendedC19 | - bucketg2JacExtendedC20 | - bucketg2JacExtendedC21 | - bucketg2JacExtendedC22 | - bucketg2JacExtendedC23 +func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { + batchSize := len(*buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG2Affine[B]{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), + } +} + +func (b *BatchG2Affine[B]) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG2Affine[B]) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG2Affine[B]) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &(*b.buckets)[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(P) + } else { + BK.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P 
== 0 + if BK.Equal(P) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = BK + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + } diff --git a/ecc/bls12-378/multiexp_jacobian.go b/ecc/bls12-378/multiexp_jacobian.go new file mode 100644 index 0000000000..a26fe93845 --- /dev/null +++ b/ecc/bls12-378/multiexp_jacobian.go @@ -0,0 +1,229 @@ +// Copyright 2020 ConsenSys Software Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package bls12378 + +import ( + "github.com/consensys/gnark-crypto/ecc/bls12-378/fr" +) + +func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, + chRes chan<- g1JacExtended, + c uint64, + points []G1Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + // for each scalars, get the digit corresponding to the chunk we're processing. + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + buckets[bits-1].addMixed(&points[i]) + } else { + // sub + buckets[bits & ^msbWindow].subMixed(&points[i]) + } + } + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] + + var runningSum, total g1JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].ZZ.IsZero() { + runningSum.add(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack +type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended +type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended +type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended +type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended +type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended +type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended +type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended +type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended +type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended +type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended +type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended +type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended +type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended +type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended +type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended + +type ibg1JacExtended interface { + bucketg1JacExtendedC1 | + bucketg1JacExtendedC3 | + bucketg1JacExtendedC4 | + bucketg1JacExtendedC5 | + bucketg1JacExtendedC6 | + bucketg1JacExtendedC7 | + bucketg1JacExtendedC8 | + bucketg1JacExtendedC9 | + bucketg1JacExtendedC10 | + bucketg1JacExtendedC11 | + bucketg1JacExtendedC12 | + bucketg1JacExtendedC13 | + bucketg1JacExtendedC14 | + bucketg1JacExtendedC15 | + bucketg1JacExtendedC16 | + bucketg1JacExtendedC20 | + bucketg1JacExtendedC21 +} + +func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + // for each scalars, get the digit corresponding to the chunk we're processing. + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + buckets[bits-1].addMixed(&points[i]) + } else { + // sub + buckets[bits & ^msbWindow].subMixed(&points[i]) + } + } + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].ZZ.IsZero() { + runningSum.add(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack +type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended +type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended +type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended +type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended +type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended +type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended +type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended +type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended +type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended +type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended +type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended +type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended +type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended + +type ibg2JacExtended interface { + bucketg2JacExtendedC1 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC4 | + bucketg2JacExtendedC5 | + bucketg2JacExtendedC6 | + bucketg2JacExtendedC7 | + bucketg2JacExtendedC8 | + bucketg2JacExtendedC9 | + bucketg2JacExtendedC10 | + bucketg2JacExtendedC11 | + bucketg2JacExtendedC12 | + bucketg2JacExtendedC13 | + bucketg2JacExtendedC14 | + bucketg2JacExtendedC15 | + bucketg2JacExtendedC16 | + bucketg2JacExtendedC20 | + bucketg2JacExtendedC21 +} diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index c4acf67088..8a80c9d1f8 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) + innerMsmG1(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} + cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -130,10 +130,10 @@ func TestMultiExpG1(t *testing.T) { results := make([]G1Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&results[i], int(c), samplePoints[:], scalars, false) + innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG1Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -171,10 +171,10 @@ func 
TestMultiExpG1(t *testing.T) { results := make([]G1Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG1Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -209,8 +209,8 @@ func TestMultiExpG1(t *testing.T) { var result1, result2 G1Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) if !result1.Equal(&result2) { return false } @@ -288,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -461,10 +461,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePoints[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -502,10 +502,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -540,8 +540,8 @@ func TestMultiExpG2(t *testing.T) { var result1, result2 G2Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result2, int(c), samplePoints[:], scalars, false) if 
!result1.Equal(&result2) { return false } @@ -619,7 +619,7 @@ func BenchmarkMultiExpG2(b *testing.B) { b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index a66bb3aa70..d926dc8e2e 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -25,143 +25,6 @@ import ( "runtime" ) -// selector stores the index, mask and shifts needed to select bits from a scalar -// it is used during the multiExp algorithm or the batch scalar multiplication -type selector struct { - index uint64 // index in the multi-word scalar to select bits from - mask uint64 // mask (c-bit wide) - shift uint64 // shift needed to get our bits on low positions - - multiWordSelect bool // set to true if we need to select bits from 2 words (case where c doesn't divide 64) - maskHigh uint64 // same than mask, for index+1 - shiftHigh uint64 // same than shift, for index+1 -} - -// partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits -// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract -// 2^{c} to the current digit, making it negative. -// negative digits can be processed in a later step as adding -G into the bucket instead of G -// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) -// scalarsMont indicates wheter the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { - toReturn := make([]fr.Element, len(scalars)) - - // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words - - // compute offset and word selector / shift to select the right bits of our windows - selectors := make([]selector, nbChunks) - for chunk := uint64(0); chunk < nbChunks; chunk++ { - jc := uint64(chunk * c) - d := selector{} - d.index = jc / 64 - d.shift = jc - (d.index * 64) - d.mask = mask << d.shift - d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1) - if d.multiWordSelect { - nbBitsHigh := d.shift - uint64(64-c) - d.maskHigh = (1 << nbBitsHigh) - 1 - d.shiftHigh = (c - nbBitsHigh) - } - selectors[chunk] = d - } - - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is 
not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK. - chSmallValues := make(chan int, nbTasks) - - parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 - for i := start; i < end; i++ { - var carry int - - scalar := scalars[i] - if scalarsMont { - scalar.FromMont() - } - if scalar.FitsOnOneWord() { - // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } - } - - // for each chunk in the scalar, compute the current digit, and an eventual carry - for chunk := uint64(0); chunk < nbChunks; chunk++ { - s := selectors[chunk] - - // init with carry if any - digit := carry - carry = 0 - - // digit = value of the c-bit window - digit += int((scalar[s.index] & s.mask) >> s.shift) - - if s.multiWordSelect { - // we are selecting bits over 2 words - digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh - } - - // if digit is zero, no impact on result - if digit == 0 { - continue - } - - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - if digit >= max { - digit -= (1 << c) - carry = 1 - } - - var bits uint64 - if digit >= 0 { - bits = uint64(digit) - } else { - bits = uint64(-digit-1) | msbWindow - } - - toReturn[i][s.index] |= (bits << s.shift) - if s.multiWordSelect { - toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - } - - } - } - - chSmallValues <- smallValues - - }, nbTasks) - - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o - } - return toReturn, smallValues -} - // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf // // This call return an error if len(scalars) != len(points) or if provided config is invalid. @@ -221,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -266,7 +129,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1Jac , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 // we have nbSplits intermediate results that we must sum together. 
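For intuition on the bestC heuristic kept above: it scores each window size c by the approximate group-operation count bits/c * (nbPoints + 2^c) and keeps the minimizer. The following standalone sketch mirrors that selection; the name bestWindowSize is hypothetical, and the 256-bit scalar width (fr.Limbs = 4) is an assumption for illustration, not something this patch pins down:

	package main

	import (
		"fmt"
		"math"
	)

	// bestWindowSize returns the implemented window size c minimizing the
	// approximate MSM cost bits/c * (nbPoints + 2^c).
	func bestWindowSize(nbPoints int, implementedCs []uint64) uint64 {
		const scalarBits = 256 // assumption: a 4x64-bit scalar field
		bestC := implementedCs[0]
		bestCost := math.MaxFloat64
		for _, c := range implementedCs {
			// nbPoints bucket updates per chunk, plus ~2^c additions to fold
			// the buckets, over roughly scalarBits/c chunks.
			cost := float64(scalarBits) / float64(c) * float64(nbPoints+(1<<c))
			if cost < bestCost {
				bestCost = cost
				bestC = c
			}
		}
		return bestC
	}

	func main() {
		cs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
		fmt.Println(bestWindowSize(1<<20, cs)) // prints 16
	}

With a million points this picks c = 16: relative to c = 8 it halves the number of chunks, while the extra 2^16 bucket-fold additions stay small next to the 2^20 bucket insertions per chunk.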
@@ -276,12 +139,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG1Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG1Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -290,169 +153,79 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG1Jac(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] + _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 10: - msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC10] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + _innerMsmG1(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 11: - msmCG1Affine[bucketg1JacExtendedC11, 
bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC11] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] + _innerMsmG1(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 12: - msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC12] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 13: - msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC13] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] + _innerMsmG1(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 14: - msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC14] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 15: - msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC15] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + _innerMsmG1(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 16: - msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) case 20: - msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC20] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] + _innerMsmG1(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 21: - msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC21] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - 
_p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -func msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64, - chRes chan<- g1JacExtended, - c uint64, - points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) - } - } - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - - var runningSum, total g1JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { nbChunks++ } + // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is @@ -464,45 +237,54 @@ func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, poi chChunks[i] = make(chan g1JacExtended, 1) } - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel last C restore. - msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } + // the last chunk may be processed with a different method than the rest, as it could be smaller. 
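+	// (editor's illustration, not part of the original patch: for a 4-limb scalar field,
+	// fr.Limbs*64 = 256 bits, so with c = 10 a scalar splits into 25 full 10-bit windows
+	// plus a 6-bit remainder; that remainder is why the c = 10 case above pairs the
+	// batch-affine processChunk with processChunkG1Jacobian[bucketg1JacExtendedC6] as
+	// processLastChunk.)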
+	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars)

-	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars)
+	for j := int(nbChunks - 2); j > 0; j-- {
+		go processChunk(uint64(j), chChunks[j], c, points, scalars)
 	}

-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
-	}
+	// the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1]
+	// --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunks are processed
+	// in the ~same amount of time
+	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
+		if !splitFirstChunk {
+			go processChunk(0, chChunks[0], c, points, scalars)
+		} else {
+			chSplit := make(chan g1JacExtended, 2)
+			split := len(points) / 2
+			go processChunk(0, chSplit, c, points[:split], scalars[:split])
+			go processChunk(0, chSplit, c, points[split:], scalars[split:])
+			go func() {
+				s1 := <-chSplit
+				s2 := <-chSplit
+				close(chSplit)
+				s1.add(&s2)
+				chChunks[0] <- s1
+			}()
+		}

-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g1JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
-	}
+	}

 	return msmReduceChunkG1Affine(p, int(c), chChunks[:])
 }

+// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp
+func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac {
+	var _p g1JacExtended
+	totalj := <-chChunks[len(chChunks)-1]
+	_p.Set(&totalj)
+	for j := len(chChunks) - 2; j >= 0; j-- {
+		for l := 0; l < c; l++ {
+			_p.double(&_p)
+		}
+		totalj := <-chChunks[j]
+		_p.add(&totalj)
+	}
+
+	return p.unsafeFromJacExtended(&_p)
+}
+
 // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
 //
 // This call return an error if len(scalars) != len(points) or if provided config is invalid.
@@ -562,7 +344,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul
 	// we split recursively until nbChunks(c) >= nbTasks,
 	bestC := func(nbPoints int) uint64 {
 		// implemented msmC methods (the c we use must be in this slice)
-		implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
 		var C uint64
 		// approximate cost (in group operations)
 		// cost = bits/c * (nbPoints + 2^{c})
@@ -607,7 +389,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul
 	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)

 	// if we have more than 10% of small values, we split the processing of the first chunk in 2
-	// we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time
+	// we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time
 	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1

 	// we have nbSplits intermediate results that we must sum together.
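Illustration (editor's addition, not part of the patch): msmReduceChunkG1Affine earlier in this hunk folds the per-window results Horner-style -- c doublings, then one add -- which equals the weighted sum sum_j 2^(c*j) * chunk_j. A minimal, self-contained Go sketch of that identity over the integers, with big.Int standing in for group elements and toy window sums:

package main

import (
	"fmt"
	"math/big"
)

func main() {
	const c = 4
	// per-window sums, least-significant window first (toy values)
	chunks := []*big.Int{big.NewInt(7), big.NewInt(3), big.NewInt(11)}

	// Horner-style fold, mirroring msmReduceChunkG1Affine: start from the
	// most-significant window, shift by c bits ("double" c times), then add.
	acc := new(big.Int).Set(chunks[len(chunks)-1])
	for j := len(chunks) - 2; j >= 0; j-- {
		acc.Lsh(acc, c)
		acc.Add(acc, chunks[j])
	}

	// direct weighted sum: sum_j chunks[j] * 2^(c*j)
	direct := new(big.Int)
	for j := range chunks {
		direct.Add(direct, new(big.Int).Lsh(chunks[j], uint(c*j)))
	}

	fmt.Println(acc, direct) // both print 2871
}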
@@ -617,12 +399,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -631,82 +413,120 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] + _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 6: - msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 7: - msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 8: - msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] + _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 9: - msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 10: - msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC10] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + _innerMsmG2(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 11: - msmCG2Affine[bucketg2JacExtendedC11, 
bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC11] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] + _innerMsmG2(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 12: - msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC12] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 13: - msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC13] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] + _innerMsmG2(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 14: - msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC14] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 15: - msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC15] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] + _innerMsmG2(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 16: - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + case 20: + processChunk := processChunkG2BatchAffine[bucketG2AffineC20] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] + _innerMsmG2(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + case 21: + processChunk := processChunkG2BatchAffine[bucketG2AffineC21] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + default: + panic("not implemented") + } +} - case 17: - msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { - case 18: - msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } - case 19: - msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance - case 20: - msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 
0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan g2JacExtended, 1)
+	}

-	case 21:
-		msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk)
+	// the last chunk may be processed with a different method than the rest, as it could be smaller.
+	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars)

-	case 22:
-		msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk)
+	for j := int(nbChunks - 2); j > 0; j-- {
+		go processChunk(uint64(j), chChunks[j], c, points, scalars)
+	}

-	case 23:
-		msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk)
+	// the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1]
+	// --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunks are processed
+	// in the ~same amount of time
+	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
+		if !splitFirstChunk {
+			go processChunk(0, chChunks[0], c, points, scalars)
+		} else {
+			chSplit := make(chan g2JacExtended, 2)
+			split := len(points) / 2
+			go processChunk(0, chSplit, c, points[:split], scalars[:split])
+			go processChunk(0, chSplit, c, points[split:], scalars[split:])
+			go func() {
+				s1 := <-chSplit
+				s2 := <-chSplit
+				close(chSplit)
+				s1.add(&s2)
+				chChunks[0] <- s1
+			}()
+		}

-	default:
-		panic("not implemented")
 	}
+
+	return msmReduceChunkG2Affine(p, int(c), chChunks[:])
 }

 // msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp
@@ -725,121 +545,139 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J
 	return p.unsafeFromJacExtended(&_p)
 }

-func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64,
-	chRes chan<- g2JacExtended,
-	c uint64,
-	points []G2Affine,
-	scalars []fr.Element) {
+// selector stores the index, mask and shifts needed to select bits from a scalar
+// it is used during the multiExp algorithm or the batch scalar multiplication
+type selector struct {
+	index uint64 // index in the multi-word scalar to select bits from
+	mask  uint64 // mask (c-bit wide)
+	shift uint64 // shift needed to get our bits on low positions

-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
+	multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
+	maskHigh        uint64 // same as mask, for index+1
+	shiftHigh       uint64 // same as shift, for index+1
+}

-	var buckets B
-	for i := 0; i < len(buckets); i++ {
-		buckets[i].setInfinity()
-	}
+// partitionScalars computes, for each scalar over c-bit wide windows, nbChunks digits
+// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
+// scalarsMont indicates whether the provided scalars are in montgomery form
+// returns smallValues, which represents the number of scalars that meet the following condition
+// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero)
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+	toReturn := make([]fr.Element, len(scalars))

-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
 	}

-	// for each scalars, get the digit corresponding to the chunk we're processing.
-	for i := 0; i < len(scalars); i++ {
-		bits := (scalars[i][s.index] & s.mask) >> s.shift
-		if s.multiWordSelect {
-			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
-		}
-
-		if bits == 0 {
-			continue
-		}
+	mask := uint64((1 << c) - 1)      // low c bits are 1
+	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
+	max := int(1 << (c - 1))          // max value we want for our digits
+	cDivides64 := (64 % c) == 0       // if c doesn't divide 64, we may need to select over multiple words

-		// if msbWindow bit is set, we need to substract
-		if bits&msbWindow == 0 {
-			// add
-			buckets[bits-1].addMixed(&points[i])
-		} else {
-			// sub
-			buckets[bits & ^msbWindow].subMixed(&points[i])
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
 		}
+		selectors[chunk] = d
 	}

-	// reduce buckets into total
-	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split first chunk
+	// processing in the msm in 2, to ensure all go routines finish at ~same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it does, though, this will deadlock.
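+	// (editor's illustration, not part of the original patch: typical small values are the
+	// 0/1 boolean wires of a SNARK witness, which satisfy scalar[0]&mask == scalar[0]; when
+	// they make up at least ~10% of the scalars, the MultiExp callers above set
+	// splitFirstChunk and process chunk 0 with two goroutines.)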
+	chSmallValues := make(chan int, nbTasks)

-	var runningSum, total g2JacExtended
-	runningSum.setInfinity()
-	total.setInfinity()
-	for k := len(buckets) - 1; k >= 0; k-- {
-		if !buckets[k].ZZ.IsZero() {
-			runningSum.add(&buckets[k])
-		}
-		total.add(&runningSum)
-	}
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int

-	chRes <- total
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.FitsOnOneWord() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c-bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}

-}
+			// for each chunk in the scalar, compute the current digit, and an eventual carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]

-func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac {
-	nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	if (fr.Limbs*64)%c != 0 {
-		nbChunks++
-	}
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+				// init with carry if any
+				digit := carry
+				carry = 0

-	// each go routine sends its result in chChunks[i] channel
-	chChunks := make([]chan g2JacExtended, nbChunks)
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g2JacExtended, 1)
-	}
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)

-	if (fr.Limbs*64)%c != 0 {
-		// TODO @gbotrel not always needed to do ext jac here.
-		go func(j uint64, points []G2Affine, scalars []fr.Element) {
-			// var buckets LB
-			// lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-			// buckets := make([]g2JacExtended, 1<<(lastC-1))
-			// TODO @gbotrel last C restore.
-			msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars)
-		}(uint64(nbChunks-1), points, scalars)
-		nbChunks--
-	}
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}

-	processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) {
-		msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars)
-	}
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}

-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
-	}
+				// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and subtract
+				// 2^{c} from the current digit, making it negative.
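+				// (editor's example, not part of the original patch: with c = 4, max = 2^3 = 8,
+				// a window value of 13 becomes 13 - 16 = -3 with carry = 1, since
+				// 13*2^k = -3*2^k + 2^(k+4).)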
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}

-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g2JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
-	}
+				var bits uint64
+				if digit >= 0 {
+					bits = uint64(digit)
+				} else {
+					bits = uint64(-digit-1) | msbWindow
+				}

-	return msmReduceChunkG2Affine(p, int(c), chChunks[:])
+				toReturn[i][s.index] |= (bits << s.shift)
+				if s.multiWordSelect {
+					toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				}
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
 }
diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go
index d9469a7fb1..36695009a0 100644
--- a/ecc/bls12-381/multiexp_affine.go
+++ b/ecc/bls12-381/multiexp_affine.go
@@ -17,11 +17,7 @@ package bls12381
 
 import (
-	"errors"
-	"github.com/consensys/gnark-crypto/ecc"
 	"github.com/consensys/gnark-crypto/ecc/bls12-381/fr"
-	"math"
-	"runtime"
 )
 
 const MAX_BATCH_SIZE = 600
@@ -34,320 +30,13 @@ func (o batchOp) isNeg() bool {
 	return o.pointID&1 == 1
 }
 
-// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+// processChunkG1BatchAffine processes a chunk of the scalars during the msm
+// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition
+// we use a batch affine addition.
 //
-// This call return an error if len(scalars) != len(points) or if provided config is invalid.
-func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) {
-	var _p G1Jac
-	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
-		return nil, err
-	}
-	p.FromJacobian(&_p)
-	return p, nil
-}
-
-// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
-//
-// This call return an error if len(scalars) != len(points) or if provided config is invalid.
-func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) {
-	// note:
-	// each of the batchAffineMsmCX method is the same, except for the c constant it declares
-	// duplicating (through template generation) these methods allows to declare the buckets on the stack
-	// the choice of c needs to be improved:
-	// there is a theoritical value that gives optimal asymptotics
-	// but in practice, other factors come into play, including:
-	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
-	// * number of CPUs
-	// * cache friendliness (which depends on the host, G1 or G2... )
-	// --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't.
-
-	// for each batchAffineMsmCX
-	// step 1
-	// we compute, for each scalars over c-bit wide windows, nbChunk digits
-	// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract
-	// 2^{c} to the current digit, making it negative.
- // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - - case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - - case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - - case 10: - batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - - case 11: - batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - - case 12: - batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - - case 13: - batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - - case 14: - batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - - case 15: - batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - - case 16: - batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - - case 20: - batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - - case 21: - batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -type BatchG1Affine[B ibG1Affine] struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points 
[]G1Affine - buckets *B -} - -func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { - batchSize := len(*buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG1Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} - -func (b *BatchG1Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG1Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG1Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} - -func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -441,66 +130,8 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, } -func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g1JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. 
-			msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars)
-		}(uint64(nbChunks-1), points, scalars)
-		nbChunks--
-	}
-
-	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars)
-	}
-
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
-	}
-
-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g1JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
-	}
-
-	return msmReduceChunkG1Affine(p, int(c), chChunks[:])
-}
-
-type bucketG1AffineC1 [1 << (1 - 1)]G1Affine
-type bucketG1AffineC2 [1 << (2 - 1)]G1Affine
-type bucketG1AffineC3 [1 << (3 - 1)]G1Affine
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
 type bucketG1AffineC4 [1 << (4 - 1)]G1Affine
 type bucketG1AffineC5 [1 << (5 - 1)]G1Affine
 type bucketG1AffineC6 [1 << (6 - 1)]G1Affine
@@ -514,42 +145,11 @@ type bucketG1AffineC13 [1 << (13 - 1)]G1Affine
 type bucketG1AffineC14 [1 << (14 - 1)]G1Affine
 type bucketG1AffineC15 [1 << (15 - 1)]G1Affine
 type bucketG1AffineC16 [1 << (16 - 1)]G1Affine
-type bucketG1AffineC17 [1 << (17 - 1)]G1Affine
-type bucketG1AffineC18 [1 << (18 - 1)]G1Affine
-type bucketG1AffineC19 [1 << (19 - 1)]G1Affine
 type bucketG1AffineC20 [1 << (20 - 1)]G1Affine
 type bucketG1AffineC21 [1 << (21 - 1)]G1Affine
-type bucketG1AffineC22 [1 << (22 - 1)]G1Affine
-type bucketG1AffineC23 [1 << (23 - 1)]G1Affine
-type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended
-type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended
-type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended
-type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended
-type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended
-type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended
-type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended
-type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended
-type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended
-type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended
-type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended
-type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended
-type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended
-type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended
-type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended
-type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended
-type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended
-type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended
-type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended
-type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended
-type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended
-type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended
-type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended
 
 type ibG1Affine interface {
-	bucketG1AffineC1 |
-		bucketG1AffineC2 |
-		bucketG1AffineC3 |
-		bucketG1AffineC4 |
+	bucketG1AffineC4 |
 		bucketG1AffineC5 |
 		bucketG1AffineC6 |
 		bucketG1AffineC7 |
@@ -562,258 +162,21 @@
 		bucketG1AffineC14 |
 		bucketG1AffineC15 |
 		bucketG1AffineC16 |
-		bucketG1AffineC17 |
-		bucketG1AffineC18 |
-		bucketG1AffineC19 |
 		bucketG1AffineC20 |
-		bucketG1AffineC21 |
-		bucketG1AffineC22 |
-		
bucketG1AffineC23 -} - -type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC2 | - bucketg1JacExtendedC3 | - bucketg1JacExtendedC4 | - bucketg1JacExtendedC5 | - bucketg1JacExtendedC6 | - bucketg1JacExtendedC7 | - bucketg1JacExtendedC8 | - bucketg1JacExtendedC9 | - bucketg1JacExtendedC10 | - bucketg1JacExtendedC11 | - bucketg1JacExtendedC12 | - bucketg1JacExtendedC13 | - bucketg1JacExtendedC14 | - bucketg1JacExtendedC15 | - bucketg1JacExtendedC16 | - bucketg1JacExtendedC17 | - bucketg1JacExtendedC18 | - bucketg1JacExtendedC19 | - bucketg1JacExtendedC20 | - bucketg1JacExtendedC21 | - bucketg1JacExtendedC22 | - bucketg1JacExtendedC23 -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. 
- // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 1: - msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - - case 4: - msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - - case 5: - msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - - case 8: - msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - - case 10: - batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - - case 11: - batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - - case 12: - batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - - case 13: - batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - - case 14: - batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - - case 15: - batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - - case 16: - batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - - case 20: - batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - - case 21: - batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } + bucketG1AffineC21 } -type BatchG2Affine[B ibG2Affine] struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine +type BatchG1Affine[B ibG1Affine] struct { + P 
[MAX_BATCH_SIZE]G1Affine
+	R         [MAX_BATCH_SIZE]*G1Affine
 	batchSize int
 	cptP      int
 	bucketIds map[uint32]struct{}
-	points    []G2Affine
+	points    []G1Affine
 	buckets   *B
 }
 
-func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] {
+func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] {
 	batchSize := len(*buckets) / 5
 	if batchSize > MAX_BATCH_SIZE {
 		batchSize = MAX_BATCH_SIZE
@@ -821,7 +184,7 @@ func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine
 	if batchSize <= 0 {
 		batchSize = 1
 	}
-	return BatchG2Affine[B]{
+	return BatchG1Affine[B]{
 		buckets:   buckets,
 		points:    points,
 		batchSize: batchSize,
@@ -829,18 +192,18 @@ func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine
 	}
 }
 
-func (b *BatchG2Affine[B]) IsFull() bool {
+func (b *BatchG1Affine[B]) IsFull() bool {
 	return b.cptP == b.batchSize
 }
 
-func (b *BatchG2Affine[B]) ExecuteAndReset() {
+func (b *BatchG1Affine[B]) ExecuteAndReset() {
 	if b.cptP == 0 {
 		return
 	}
 	// for i := 0; i < len(b.R); i++ {
 	// 	b.R[i].Add(b.R[i], b.P[i])
 	// }
-	BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP)
+	BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP)
 	for k := range b.bucketIds {
 		delete(b.bucketIds, k)
 	}
@@ -848,12 +211,12 @@ func (b *BatchG2Affine[B]) ExecuteAndReset() {
 	b.cptP = 0
 }
 
-func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool {
+func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool {
 	_, ok := b.bucketIds[bID]
 	return !ok
 }
 
-func (b *BatchG2Affine[B]) Add(op batchOp) {
+func (b *BatchG1Affine[B]) Add(op batchOp) {
 	// CanAdd must be called before --> ensures bucket is not "used" in current batch
 
 	BK := &(*b.buckets)[op.bucketID]
@@ -895,7 +258,7 @@ func (b *BatchG2Affine[B]) Add(op batchOp) {
 	b.cptP++
 }
 
-func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp {
+func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp {
 	for i := len(queue) - 1; i >= 0; i-- {
 		if batch.CanAdd(queue[i].bucketID) {
 			batch.Add(queue[i])
@@ -910,7 +273,13 @@ func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]
 
 }
 
-func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64,
+// processChunkG2BatchAffine processes a chunk of the scalars during the msm
+// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition
+// we use a batch affine addition.
+//
+// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249
+// See Section 5.3: ia.cr/2022/1396
+func processChunkG2BatchAffine[B ibG2Affine](chunk uint64,
 	chRes chan<- g2JacExtended,
 	c uint64,
 	points []G2Affine,
 	scalars []fr.Element) {
@@ -1004,66 +373,8 @@ func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64,
 
 }
 
-func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac {
-
-	nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	if (fr.Limbs*64)%c != 0 {
-		nbChunks++
-	}
-
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
-
-	// each go routine sends its result in chChunks[i] channel
-	chChunks := make([]chan g2JacExtended, nbChunks)
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g2JacExtended, 1)
-	}
-
-	if (fr.Limbs*64)%c != 0 {
-		// TODO @gbotrel not always needed to do ext jac here.
-		go func(j uint64, points []G2Affine, scalars []fr.Element) {
-			// var buckets LB
-			// lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-			// buckets := make([]g2JacExtended, 1<<(lastC-1))
-			// TODO @gbotrel lastC restore.
-			msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars)
-		}(uint64(nbChunks-1), points, scalars)
-		nbChunks--
-	}
-
-	processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) {
-		msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars)
-	}
-
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
-	}
-
-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g2JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
-	}
-
-	return msmReduceChunkG2Affine(p, int(c), chChunks[:])
-}
-
-type bucketG2AffineC1 [1 << (1 - 1)]G2Affine
-type bucketG2AffineC2 [1 << (2 - 1)]G2Affine
-type bucketG2AffineC3 [1 << (3 - 1)]G2Affine
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
 type bucketG2AffineC4 [1 << (4 - 1)]G2Affine
 type bucketG2AffineC5 [1 << (5 - 1)]G2Affine
 type bucketG2AffineC6 [1 << (6 - 1)]G2Affine
@@ -1077,42 +388,11 @@ type bucketG2AffineC13 [1 << (13 - 1)]G2Affine
 type bucketG2AffineC14 [1 << (14 - 1)]G2Affine
 type bucketG2AffineC15 [1 << (15 - 1)]G2Affine
 type bucketG2AffineC16 [1 << (16 - 1)]G2Affine
-type bucketG2AffineC17 [1 << (17 - 1)]G2Affine
-type bucketG2AffineC18 [1 << (18 - 1)]G2Affine
-type bucketG2AffineC19 [1 << (19 - 1)]G2Affine
 type bucketG2AffineC20 [1 << (20 - 1)]G2Affine
 type bucketG2AffineC21 [1 << (21 - 1)]G2Affine
-type bucketG2AffineC22 [1 << (22 - 1)]G2Affine
-type bucketG2AffineC23 [1 << (23 - 1)]G2Affine
-type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended
-type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended
-type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended
-type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended
-type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended
-type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended
-type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended
-type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended -type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended -type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended -type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended -type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended -type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended -type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended -type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended -type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended -type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended -type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended -type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended -type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended type ibG2Affine interface { - bucketG2AffineC1 | - bucketG2AffineC2 | - bucketG2AffineC3 | - bucketG2AffineC4 | + bucketG2AffineC4 | bucketG2AffineC5 | bucketG2AffineC6 | bucketG2AffineC7 | @@ -1125,37 +405,113 @@ type ibG2Affine interface { bucketG2AffineC14 | bucketG2AffineC15 | bucketG2AffineC16 | - bucketG2AffineC17 | - bucketG2AffineC18 | - bucketG2AffineC19 | bucketG2AffineC20 | - bucketG2AffineC21 | - bucketG2AffineC22 | - bucketG2AffineC23 + bucketG2AffineC21 +} + +type BatchG2Affine[B ibG2Affine] struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G2Affine + buckets *B } -type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC2 | - bucketg2JacExtendedC3 | - bucketg2JacExtendedC4 | - bucketg2JacExtendedC5 | - bucketg2JacExtendedC6 | - bucketg2JacExtendedC7 | - bucketg2JacExtendedC8 | - bucketg2JacExtendedC9 | - bucketg2JacExtendedC10 | - bucketg2JacExtendedC11 | - bucketg2JacExtendedC12 | - bucketg2JacExtendedC13 | - bucketg2JacExtendedC14 | - bucketg2JacExtendedC15 | - bucketg2JacExtendedC16 | - bucketg2JacExtendedC17 | - bucketg2JacExtendedC18 | - bucketg2JacExtendedC19 | - bucketg2JacExtendedC20 | - bucketg2JacExtendedC21 | - bucketg2JacExtendedC22 | - bucketg2JacExtendedC23 +func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { + batchSize := len(*buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG2Affine[B]{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), + } +} + +func (b *BatchG2Affine[B]) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG2Affine[B]) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG2Affine[B]) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &(*b.buckets)[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(P) + } else { + BK.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P 
== 0 + if BK.Equal(P) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = BK + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + } diff --git a/ecc/bls12-381/multiexp_jacobian.go b/ecc/bls12-381/multiexp_jacobian.go new file mode 100644 index 0000000000..a4e61348b7 --- /dev/null +++ b/ecc/bls12-381/multiexp_jacobian.go @@ -0,0 +1,229 @@ +// Copyright 2020 ConsenSys Software Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package bls12381 + +import ( + "github.com/consensys/gnark-crypto/ecc/bls12-381/fr" +) + +func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, + chRes chan<- g1JacExtended, + c uint64, + points []G1Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + // for each scalars, get the digit corresponding to the chunk we're processing. + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + buckets[bits-1].addMixed(&points[i]) + } else { + // sub + buckets[bits & ^msbWindow].subMixed(&points[i]) + } + } + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1]
+
+	var runningSum, total g1JacExtended
+	runningSum.setInfinity()
+	total.setInfinity()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].ZZ.IsZero() {
+			runningSum.add(&buckets[k])
+		}
+		total.add(&runningSum)
+	}
+
+	chRes <- total
+
+}
+
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
+type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended
+type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended
+type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended
+type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended
+type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended
+type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended
+type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended
+type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended
+type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended
+type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended
+type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended
+type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended
+type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended
+type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended
+type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended
+type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended
+type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended
+
+type ibg1JacExtended interface {
+	bucketg1JacExtendedC1 |
+		bucketg1JacExtendedC3 |
+		bucketg1JacExtendedC4 |
+		bucketg1JacExtendedC5 |
+		bucketg1JacExtendedC6 |
+		bucketg1JacExtendedC7 |
+		bucketg1JacExtendedC8 |
+		bucketg1JacExtendedC9 |
+		bucketg1JacExtendedC10 |
+		bucketg1JacExtendedC11 |
+		bucketg1JacExtendedC12 |
+		bucketg1JacExtendedC13 |
+		bucketg1JacExtendedC14 |
+		bucketg1JacExtendedC15 |
+		bucketg1JacExtendedC16 |
+		bucketg1JacExtendedC20 |
+		bucketg1JacExtendedC21
+}
+
+func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64,
+	chRes chan<- g2JacExtended,
+	c uint64,
+	points []G2Affine,
+	scalars []fr.Element) {
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))
+
+	var buckets B
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
+	}
+
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
+	}
+
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].addMixed(&points[i])
+		} else {
+			// sub
+			buckets[bits & ^msbWindow].subMixed(&points[i])
+		}
+	}
+
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ...
+ n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].ZZ.IsZero() { + runningSum.add(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack +type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended +type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended +type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended +type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended +type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended +type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended +type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended +type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended +type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended +type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended +type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended +type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended +type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended + +type ibg2JacExtended interface { + bucketg2JacExtendedC1 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC4 | + bucketg2JacExtendedC5 | + bucketg2JacExtendedC6 | + bucketg2JacExtendedC7 | + bucketg2JacExtendedC8 | + bucketg2JacExtendedC9 | + bucketg2JacExtendedC10 | + bucketg2JacExtendedC11 | + bucketg2JacExtendedC12 | + bucketg2JacExtendedC13 | + bucketg2JacExtendedC14 | + bucketg2JacExtendedC15 | + bucketg2JacExtendedC16 | + bucketg2JacExtendedC20 | + bucketg2JacExtendedC21 +} diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index 4248afb29d..15cd0f5304 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) + innerMsmG1(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} + cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -130,10 +130,10 @@ func TestMultiExpG1(t *testing.T) { results := make([]G1Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&results[i], int(c), samplePoints[:], scalars, false) + innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG1Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -171,10 +171,10 @@ func 
TestMultiExpG1(t *testing.T) { results := make([]G1Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG1Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -209,8 +209,8 @@ func TestMultiExpG1(t *testing.T) { var result1, result2 G1Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) if !result1.Equal(&result2) { return false } @@ -288,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -461,10 +461,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePoints[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -502,10 +502,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -540,8 +540,8 @@ func TestMultiExpG2(t *testing.T) { var result1, result2 G2Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result2, int(c), samplePoints[:], scalars, false) if 
!result1.Equal(&result2) { return false } @@ -619,7 +619,7 @@ func BenchmarkMultiExpG2(b *testing.B) { b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index f97aa4e2f4..3686207518 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -25,143 +25,6 @@ import ( "runtime" ) -// selector stores the index, mask and shifts needed to select bits from a scalar -// it is used during the multiExp algorithm or the batch scalar multiplication -type selector struct { - index uint64 // index in the multi-word scalar to select bits from - mask uint64 // mask (c-bit wide) - shift uint64 // shift needed to get our bits on low positions - - multiWordSelect bool // set to true if we need to select bits from 2 words (case where c doesn't divide 64) - maskHigh uint64 // same than mask, for index+1 - shiftHigh uint64 // same than shift, for index+1 -} - -// partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits -// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract -// 2^{c} to the current digit, making it negative. -// negative digits can be processed in a later step as adding -G into the bucket instead of G -// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) -// scalarsMont indicates wheter the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { - toReturn := make([]fr.Element, len(scalars)) - - // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words - - // compute offset and word selector / shift to select the right bits of our windows - selectors := make([]selector, nbChunks) - for chunk := uint64(0); chunk < nbChunks; chunk++ { - jc := uint64(chunk * c) - d := selector{} - d.index = jc / 64 - d.shift = jc - (d.index * 64) - d.mask = mask << d.shift - d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1) - if d.multiWordSelect { - nbBitsHigh := d.shift - uint64(64-c) - d.maskHigh = (1 << nbBitsHigh) - 1 - d.shiftHigh = (c - nbBitsHigh) - } - selectors[chunk] = d - } - - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is 
not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK. - chSmallValues := make(chan int, nbTasks) - - parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 - for i := start; i < end; i++ { - var carry int - - scalar := scalars[i] - if scalarsMont { - scalar.FromMont() - } - if scalar.FitsOnOneWord() { - // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } - } - - // for each chunk in the scalar, compute the current digit, and an eventual carry - for chunk := uint64(0); chunk < nbChunks; chunk++ { - s := selectors[chunk] - - // init with carry if any - digit := carry - carry = 0 - - // digit = value of the c-bit window - digit += int((scalar[s.index] & s.mask) >> s.shift) - - if s.multiWordSelect { - // we are selecting bits over 2 words - digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh - } - - // if digit is zero, no impact on result - if digit == 0 { - continue - } - - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - if digit >= max { - digit -= (1 << c) - carry = 1 - } - - var bits uint64 - if digit >= 0 { - bits = uint64(digit) - } else { - bits = uint64(-digit-1) | msbWindow - } - - toReturn[i][s.index] |= (bits << s.shift) - if s.multiWordSelect { - toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - } - - } - } - - chSmallValues <- smallValues - - }, nbTasks) - - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o - } - return toReturn, smallValues -} - // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf // // This call return an error if len(scalars) != len(points) or if provided config is invalid. @@ -221,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -266,7 +129,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1Jac , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 // we have nbSplits intermediate results that we must sum together. 
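
As a companion to the window-selection heuristic above, here is a minimal standalone sketch (illustration only, not part of this patch) of the cost function bestC minimizes; the 4-limb (256-bit) scalar-field width is an assumption standing in for fr.Limbs on this curve:

package main

import (
	"fmt"
	"math"
)

// bestC mirrors the heuristic used in MultiExp: pick the window size c that
// minimizes cost ~= (nbBits/c) * (nbPoints + 2^c) group operations, restricted
// to the window sizes that have generated code.
func bestC(nbPoints int) uint64 {
	const frLimbs = 4 // assumption: 4 x 64-bit limbs in the scalar field
	implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
	var C uint64
	min := math.MaxFloat64
	for _, c := range implementedCs {
		cc := frLimbs * 64 * (nbPoints + (1 << c))
		cost := float64(cc) / float64(c)
		if cost < min {
			min = cost
			C = c
		}
	}
	return C
}

func main() {
	// larger instances justify larger windows (more buckets, fewer chunks)
	for _, n := range []int{1 << 10, 1 << 16, 1 << 20} {
		fmt.Printf("nbPoints=%d -> c=%d\n", n, bestC(n))
	}
}
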
@@ -276,12 +139,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG1Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG1Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -290,169 +153,79 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG1Jac(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] + _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 10: - msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC10] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + _innerMsmG1(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 11: - msmCG1Affine[bucketg1JacExtendedC11, 
bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC11] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] + _innerMsmG1(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 12: - msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC12] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 13: - msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC13] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] + _innerMsmG1(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 14: - msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC14] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 15: - msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC15] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + _innerMsmG1(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 16: - msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) case 20: - msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC20] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] + _innerMsmG1(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 21: - msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC21] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - 
_p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -func msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64, - chRes chan<- g1JacExtended, - c uint64, - points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) - } - } - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - - var runningSum, total g1JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { nbChunks++ } + // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is @@ -464,45 +237,54 @@ func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, poi chChunks[i] = make(chan g1JacExtended, 1) } - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel last C restore. - msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } + // the last chunk may be processed with a different method than the rest, as it could be smaller. 
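+	// (when c does not divide fr.Limbs*64, the top window carries only (fr.Limbs*64) mod c
+	// bits; assuming fr.Limbs = 4 here, c = 20 leaves 256 mod 20 = 16 bits, which is why
+	// the c = 20 case above pairs processChunk with a C16 last-chunk variant)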
+ go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars) + for j := int(nbChunks - 2); j > 0; j-- { + go processChunk(uint64(j), chChunks[j], c, points, scalars) } - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] + // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed + // in the ~same amount of time + if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + if !splitFirstChunk { + go processChunk(0, chChunks[0], c, points, scalars) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, chSplit, c, points[:split], scalars[:split]) + go processChunk(0, chSplit, c, points[split:], scalars[split:]) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } +// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { + var _p g1JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf // // This call return an error if len(scalars) != len(points) or if provided config is invalid. @@ -562,7 +344,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -607,7 +389,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 // we have nbSplits intermediate results that we must sum together. 
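
The reduction step is worth a worked example: msmReduceChunkG1Affine performs c doublings between chunk additions, i.e. a Horner evaluation of result = sum_j 2^(c*j) * chunk[j]. A minimal sketch checking that identity on plain integers (stand-ins for curve points; the values are arbitrary):

package main

import (
	"fmt"
	"math/big"
)

func main() {
	const c = 4
	chunks := []*big.Int{big.NewInt(3), big.NewInt(7), big.NewInt(1)} // chunk[0], chunk[1], chunk[2]

	// Horner: start from the most significant chunk, double c times, add the next one.
	acc := new(big.Int).Set(chunks[len(chunks)-1])
	for j := len(chunks) - 2; j >= 0; j-- {
		for l := 0; l < c; l++ {
			acc.Add(acc, acc) // stands in for _p.double(&_p)
		}
		acc.Add(acc, chunks[j]) // stands in for _p.add(&totalj)
	}

	// direct evaluation of sum_j 2^(c*j) * chunk[j]
	direct := new(big.Int)
	for j, ch := range chunks {
		direct.Add(direct, new(big.Int).Lsh(ch, uint(c*j)))
	}
	fmt.Println(acc, direct, acc.Cmp(direct) == 0) // prints: 371 371 true
}
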
@@ -617,12 +399,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -631,82 +413,120 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] + _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 6: - msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 7: - msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 8: - msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] + _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 9: - msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 10: - msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC10] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + _innerMsmG2(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 11: - msmCG2Affine[bucketg2JacExtendedC11, 
bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC11] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] + _innerMsmG2(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 12: - msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC12] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 13: - msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC13] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] + _innerMsmG2(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 14: - msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC14] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 15: - msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC15] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] + _innerMsmG2(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 16: - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + case 20: + processChunk := processChunkG2BatchAffine[bucketG2AffineC20] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] + _innerMsmG2(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + case 21: + processChunk := processChunkG2BatchAffine[bucketG2AffineC21] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + default: + panic("not implemented") + } +} - case 17: - msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { - case 18: - msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } - case 19: - msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance - case 20: - msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 
0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } - case 21: - msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) + // the last chunk may be processed with a different method than the rest, as it could be smaller. + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) - case 22: - msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) + for j := int(nbChunks - 2); j > 0; j-- { + go processChunk(uint64(j), chChunks[j], c, points, scalars) + } - case 23: - msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) + // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] + // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed + // in the ~same amount of time + if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + if !splitFirstChunk { + go processChunk(0, chChunks[0], c, points, scalars) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, chSplit, c, points[:split], scalars[:split]) + go processChunk(0, chSplit, c, points[split:], scalars[split:]) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } - default: - panic("not implemented") } + + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } // msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp @@ -725,121 +545,139 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J return p.unsafeFromJacExtended(&_p) } -func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64, - chRes chan<- g2JacExtended, - c uint64, - points []G2Affine, - scalars []fr.Element) { +// selector stores the index, mask and shifts needed to select bits from a scalar +// it is used during the multiExp algorithm or the batch scalar multiplication +type selector struct { + index uint64 // index in the multi-word scalar to select bits from + mask uint64 // mask (c-bit wide) + shift uint64 // shift needed to get our bits on low positions - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + multiWordSelect bool // set to true if we need to select bits from 2 words (case where c doesn't divide 64) + maskHigh uint64 // same than mask, for index+1 + shiftHigh uint64 // same than shift, for index+1 +} - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } +// partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits +// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract +// 2^{c} to the current digit, making it negative. 
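+// (for example, with c = 4 a window value of 15 becomes 15 - 16 = -1, with a carry of 1
+// into the next window)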
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
+// scalarsMont indicates whether the provided scalars are in Montgomery form
+// returns smallValues, which represents the number of scalars that meet the following condition:
+// 0 < scalar < 2^c (in other words, scalars where only the c least significant bits are non zero)
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+	toReturn := make([]fr.Element, len(scalars))

-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
 	}

-	// for each scalars, get the digit corresponding to the chunk we're processing.
-	for i := 0; i < len(scalars); i++ {
-		bits := (scalars[i][s.index] & s.mask) >> s.shift
-		if s.multiWordSelect {
-			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
-		}
-
-		if bits == 0 {
-			continue
-		}
+	mask := uint64((1 << c) - 1)      // low c bits are 1
+	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
+	max := int(1 << (c - 1))          // max value we want for our digits
+	cDivides64 := (64 % c) == 0       // if c doesn't divide 64, we may need to select over multiple words

-		// if msbWindow bit is set, we need to substract
-		if bits&msbWindow == 0 {
-			// add
-			buckets[bits-1].addMixed(&points[i])
-		} else {
-			// sub
-			buckets[bits & ^msbWindow].subMixed(&points[i])
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
 		}
+		selectors[chunk] = d
 	}

-	// reduce buckets into total
-	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split the first chunk
+	// processing in the msm in 2, to ensure all go routines finish at ~the same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it does, though, this will deadlock.
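+	// buffering the channel with nbTasks slots is what makes the sends from the workers
+	// below non-blocking (see the deadlock note above)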
+ chSmallValues := make(chan int, nbTasks) - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } + parallel.Execute(len(scalars), func(start, end int) { + smallValues := 0 + for i := start; i < end; i++ { + var carry int - chRes <- total + scalar := scalars[i] + if scalarsMont { + scalar.FromMont() + } + if scalar.FitsOnOneWord() { + // everything is 0, no need to process this scalar + if scalar[0] == 0 { + continue + } + // low c-bits are 1 in mask + if scalar[0]&mask == scalar[0] { + smallValues++ + } + } -} + // for each chunk in the scalar, compute the current digit, and an eventual carry + for chunk := uint64(0); chunk < nbChunks; chunk++ { + s := selectors[chunk] -func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // init with carry if any + digit := carry + carry = 0 - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G2Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g2JacExtended, 1<<(lastC-1)) - // TODO @gbotrel last C restore. - msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars) - } + // if digit is zero, no impact on result + if digit == 0 { + continue + } - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract + // 2^{c} to the current digit, making it negative. 
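+				// this keeps the digit in [-2^{c-1}, 2^{c-1}-1], which is why
+				// 2^{c-1} buckets per chunk suffice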
+ if digit >= max { + digit -= (1 << c) + carry = 1 + } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + var bits uint64 + if digit >= 0 { + bits = uint64(digit) + } else { + bits = uint64(-digit-1) | msbWindow + } - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) + toReturn[i][s.index] |= (bits << s.shift) + if s.multiWordSelect { + toReturn[i][s.index+1] |= (bits >> s.shiftHigh) + } + + } + } + + chSmallValues <- smallValues + + }, nbTasks) + + // aggregate small values + close(chSmallValues) + smallValues := 0 + for o := range chSmallValues { + smallValues += o + } + return toReturn, smallValues } diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index c2d52847f3..f1cdcfe574 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -17,11 +17,7 @@ package bls24315 import ( - "errors" - "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls24-315/fr" - "math" - "runtime" ) const MAX_BATCH_SIZE = 600 @@ -34,320 +30,13 @@ func (o batchOp) isNeg() bool { return o.pointID&1 == 1 } -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// processChunkG1BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. // -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) { - var _p G1Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. 
- // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - - case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - - case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - - case 10: - batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - - case 11: - batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - - case 12: - batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - - case 13: - batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - - case 14: - batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - - case 15: - batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - - case 16: - batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - - case 20: - batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - - case 21: - batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -type BatchG1Affine[B ibG1Affine] struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points 
[]G1Affine - buckets *B -} - -func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { - batchSize := len(*buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG1Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} - -func (b *BatchG1Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG1Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG1Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} - -func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -441,66 +130,8 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, } -func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g1JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. 
-			msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars)
-		}(uint64(nbChunks-1), points, scalars)
-		nbChunks--
-	}
-
-	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars)
-	}
-
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
-	}
-
-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g1JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
-	}
-
-	return msmReduceChunkG1Affine(p, int(c), chChunks[:])
-}
-
-type bucketG1AffineC1 [1 << (1 - 1)]G1Affine
-type bucketG1AffineC2 [1 << (2 - 1)]G1Affine
-type bucketG1AffineC3 [1 << (3 - 1)]G1Affine
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
 type bucketG1AffineC4 [1 << (4 - 1)]G1Affine
 type bucketG1AffineC5 [1 << (5 - 1)]G1Affine
 type bucketG1AffineC6 [1 << (6 - 1)]G1Affine
@@ -514,42 +145,11 @@ type bucketG1AffineC13 [1 << (13 - 1)]G1Affine
 type bucketG1AffineC14 [1 << (14 - 1)]G1Affine
 type bucketG1AffineC15 [1 << (15 - 1)]G1Affine
 type bucketG1AffineC16 [1 << (16 - 1)]G1Affine
-type bucketG1AffineC17 [1 << (17 - 1)]G1Affine
-type bucketG1AffineC18 [1 << (18 - 1)]G1Affine
-type bucketG1AffineC19 [1 << (19 - 1)]G1Affine
 type bucketG1AffineC20 [1 << (20 - 1)]G1Affine
 type bucketG1AffineC21 [1 << (21 - 1)]G1Affine
-type bucketG1AffineC22 [1 << (22 - 1)]G1Affine
-type bucketG1AffineC23 [1 << (23 - 1)]G1Affine
-type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended
-type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended
-type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended
-type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended
-type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended
-type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended
-type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended
-type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended
-type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended
-type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended
-type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended
-type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended
-type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended
-type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended
-type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended
-type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended
-type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended
-type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended
-type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended
-type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended
-type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended
-type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended
-type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended
 
 type ibG1Affine interface {
-	bucketG1AffineC1 |
-		bucketG1AffineC2 |
-		bucketG1AffineC3 |
-		bucketG1AffineC4 |
+	bucketG1AffineC4 |
 		bucketG1AffineC5 |
 		bucketG1AffineC6 |
 		bucketG1AffineC7 |
@@ -562,258 +162,21 @@ type ibG1Affine interface {
 		bucketG1AffineC14 |
 		bucketG1AffineC15 |
 		bucketG1AffineC16 |
-		bucketG1AffineC17 |
-		bucketG1AffineC18 |
-		bucketG1AffineC19 |
 		bucketG1AffineC20 |
-		bucketG1AffineC21 |
-		bucketG1AffineC22 |
-		bucketG1AffineC23
-}
-
-type ibg1JacExtended interface {
-	bucketg1JacExtendedC1 |
-		bucketg1JacExtendedC2 |
-		bucketg1JacExtendedC3 |
-		bucketg1JacExtendedC4 |
-		bucketg1JacExtendedC5 |
-		bucketg1JacExtendedC6 |
-		bucketg1JacExtendedC7 |
-		bucketg1JacExtendedC8 |
-		bucketg1JacExtendedC9 |
-		bucketg1JacExtendedC10 |
-		bucketg1JacExtendedC11 |
-		bucketg1JacExtendedC12 |
-		bucketg1JacExtendedC13 |
-		bucketg1JacExtendedC14 |
-		bucketg1JacExtendedC15 |
-		bucketg1JacExtendedC16 |
-		bucketg1JacExtendedC17 |
-		bucketg1JacExtendedC18 |
-		bucketg1JacExtendedC19 |
-		bucketg1JacExtendedC20 |
-		bucketg1JacExtendedC21 |
-		bucketg1JacExtendedC22 |
-		bucketg1JacExtendedC23
-}
-
-// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
-//
-// This call return an error if len(scalars) != len(points) or if provided config is invalid.
-func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
-	var _p G2Jac
-	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
-		return nil, err
-	}
-	p.FromJacobian(&_p)
-	return p, nil
-}
-
-// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
-//
-// This call return an error if len(scalars) != len(points) or if provided config is invalid.
-func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
-	// note:
-	// each of the batchAffineMsmCX method is the same, except for the c constant it declares
-	// duplicating (through template generation) these methods allows to declare the buckets on the stack
-	// the choice of c needs to be improved:
-	// there is a theoritical value that gives optimal asymptotics
-	// but in practice, other factors come into play, including:
-	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
-	// * number of CPUs
-	// * cache friendliness (which depends on the host, G1 or G2... )
-	//	--> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't.
-
-	// for each batchAffineMsmCX
-	// step 1
-	// we compute, for each scalars over c-bit wide windows, nbChunk digits
-	// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract
-	// 2^{c} to the current digit, making it negative.
- // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
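Steps 1–3 in miniature: the toy model below swaps the group for (Z, +) to show why signed digits halve the bucket count (a negative digit just adds the negated point) and how the weighted sum bucket[0] + 2*bucket[1] + ... + n*bucket[n-1] falls out of a single running-sum sweep. Illustrative only, not library code:

package main

import "fmt"

func main() {
	const c = 4
	points := []int64{7, 11, 13, 17}
	digits := []int64{3, -2, 7, -8} // one recoded c-bit digit per point, in [-8, 7]

	// a negative digit adds the negated "point"; |digit| selects one of 2^{c-1} buckets
	var buckets [1 << (c - 1)]int64
	for i, d := range digits {
		switch {
		case d > 0:
			buckets[d-1] += points[i]
		case d < 0:
			buckets[-d-1] -= points[i]
		}
	}

	// weighted reduction: total = 1*buckets[0] + 2*buckets[1] + ... + n*buckets[n-1]
	var runningSum, total int64
	for k := len(buckets) - 1; k >= 0; k-- {
		runningSum += buckets[k]
		total += runningSum
	}

	// cross-check against the naive sum of digit*point
	var want int64
	for i := range points {
		want += digits[i] * points[i]
	}
	fmt.Println(total, total == want) // -46 true
}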
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 1: - msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - - case 4: - msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - - case 5: - msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - - case 8: - msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - - case 10: - batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - - case 11: - batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - - case 12: - batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - - case 13: - batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - - case 14: - batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - - case 15: - batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - - case 16: - batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - - case 20: - batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - - case 21: - batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } + bucketG1AffineC21 } -type BatchG2Affine[B ibG2Affine] struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine +type BatchG1Affine[B ibG1Affine] struct { + P 
[MAX_BATCH_SIZE]G1Affine
+	R         [MAX_BATCH_SIZE]*G1Affine
 	batchSize int
 	cptP      int
 	bucketIds map[uint32]struct{}
-	points    []G2Affine
+	points    []G1Affine
 	buckets   *B
 }
 
-func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] {
+func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] {
 	batchSize := len(*buckets) / 5
 	if batchSize > MAX_BATCH_SIZE {
 		batchSize = MAX_BATCH_SIZE
@@ -821,7 +184,7 @@ func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine
 	if batchSize <= 0 {
 		batchSize = 1
 	}
-	return BatchG2Affine[B]{
+	return BatchG1Affine[B]{
 		buckets:   buckets,
 		points:    points,
 		batchSize: batchSize,
@@ -829,18 +192,18 @@ func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine
 	}
 }
 
-func (b *BatchG2Affine[B]) IsFull() bool {
+func (b *BatchG1Affine[B]) IsFull() bool {
 	return b.cptP == b.batchSize
 }
 
-func (b *BatchG2Affine[B]) ExecuteAndReset() {
+func (b *BatchG1Affine[B]) ExecuteAndReset() {
 	if b.cptP == 0 {
 		return
 	}
 	// for i := 0; i < len(b.R); i++ {
 	// 	b.R[i].Add(b.R[i], b.P[i])
 	// }
-	BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP)
+	BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP)
 	for k := range b.bucketIds {
 		delete(b.bucketIds, k)
 	}
@@ -848,12 +211,12 @@ func (b *BatchG2Affine[B]) ExecuteAndReset() {
 	b.cptP = 0
 }
 
-func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool {
+func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool {
 	_, ok := b.bucketIds[bID]
 	return !ok
 }
 
-func (b *BatchG2Affine[B]) Add(op batchOp) {
+func (b *BatchG1Affine[B]) Add(op batchOp) {
 	// CanAdd must be called before --> ensures bucket is not "used" in current batch
 
 	BK := &(*b.buckets)[op.bucketID]
@@ -895,7 +258,7 @@ func (b *BatchG2Affine[B]) Add(op batchOp) {
 	b.cptP++
 }
 
-func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp {
+func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp {
 	for i := len(queue) - 1; i >= 0; i-- {
 		if batch.CanAdd(queue[i].bucketID) {
 			batch.Add(queue[i])
@@ -910,7 +273,13 @@ func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]
 
 }
 
-func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64,
+// processChunkG2BatchAffine processes a chunk of the scalars during the msm
+// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition
+// we use a batch affine addition.
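The batch affine addition performed by ExecuteAndReset/BatchAddG1Affine rests on Montgomery's batch-inversion trick: one modular inversion plus about three multiplications per element replaces one inversion per addition. A minimal sketch over a toy prime with math/big — batchInvert and the modulus 1000003 are illustrative stand-ins for the field-element version, and zero inputs are assumed away, as in the bucket use-case:

package main

import (
	"fmt"
	"math/big"
)

func batchInvert(a []*big.Int, p *big.Int) []*big.Int {
	res := make([]*big.Int, len(a))
	acc := big.NewInt(1)
	// forward pass: res[i] <- product of a[0..i-1]
	for i := range a {
		res[i] = new(big.Int).Set(acc)
		acc.Mul(acc, a[i])
		acc.Mod(acc, p)
	}
	// a single inversion of the full product
	acc.ModInverse(acc, p)
	// backward pass: res[i] <- prefix_i * (prod a[0..i])^{-1} = a[i]^{-1}
	for i := len(a) - 1; i >= 0; i-- {
		res[i].Mul(res[i], acc)
		res[i].Mod(res[i], p)
		acc.Mul(acc, a[i])
		acc.Mod(acc, p)
	}
	return res
}

func main() {
	p := big.NewInt(1000003) // a small prime standing in for the base field modulus
	xs := []*big.Int{big.NewInt(2), big.NewInt(42), big.NewInt(999)}
	for i, inv := range batchInvert(xs, p) {
		check := new(big.Int).Mul(xs[i], inv)
		fmt.Println(check.Mod(check, p)) // 1, 1, 1
	}
}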
+//
+// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249
+// See Section 5.3: ia.cr/2022/1396
+func processChunkG2BatchAffine[B ibG2Affine](chunk uint64,
 	chRes chan<- g2JacExtended,
 	c uint64,
 	points []G2Affine,
@@ -1004,66 +373,8 @@ func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64,
 
 }
 
-func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac {
-
-	nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	if (fr.Limbs*64)%c != 0 {
-		nbChunks++
-	}
-
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
-
-	// each go routine sends its result in chChunks[i] channel
-	chChunks := make([]chan g2JacExtended, nbChunks)
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g2JacExtended, 1)
-	}
-
-	if (fr.Limbs*64)%c != 0 {
-		// TODO @gbotrel not always needed to do ext jac here.
-		go func(j uint64, points []G2Affine, scalars []fr.Element) {
-			// var buckets LB
-			// lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-			// buckets := make([]g2JacExtended, 1<<(lastC-1))
-			// TODO @gbotrel lastC restore.
-			msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars)
-		}(uint64(nbChunks-1), points, scalars)
-		nbChunks--
-	}
-
-	processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) {
-		msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars)
-	}
-
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
-	}
-
-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g2JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
-	}
-
-	return msmReduceChunkG2Affine(p, int(c), chChunks[:])
-}
-
-type bucketG2AffineC1 [1 << (1 - 1)]G2Affine
-type bucketG2AffineC2 [1 << (2 - 1)]G2Affine
-type bucketG2AffineC3 [1 << (3 - 1)]G2Affine
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
 type bucketG2AffineC4 [1 << (4 - 1)]G2Affine
 type bucketG2AffineC5 [1 << (5 - 1)]G2Affine
 type bucketG2AffineC6 [1 << (6 - 1)]G2Affine
@@ -1077,42 +388,11 @@ type bucketG2AffineC13 [1 << (13 - 1)]G2Affine
 type bucketG2AffineC14 [1 << (14 - 1)]G2Affine
 type bucketG2AffineC15 [1 << (15 - 1)]G2Affine
 type bucketG2AffineC16 [1 << (16 - 1)]G2Affine
-type bucketG2AffineC17 [1 << (17 - 1)]G2Affine
-type bucketG2AffineC18 [1 << (18 - 1)]G2Affine
-type bucketG2AffineC19 [1 << (19 - 1)]G2Affine
 type bucketG2AffineC20 [1 << (20 - 1)]G2Affine
 type bucketG2AffineC21 [1 << (21 - 1)]G2Affine
-type bucketG2AffineC22 [1 << (22 - 1)]G2Affine
-type bucketG2AffineC23 [1 << (23 - 1)]G2Affine
-type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended
-type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended
-type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended
-type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended
-type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended
-type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended
-type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended
-type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended -type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended -type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended -type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended -type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended -type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended -type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended -type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended -type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended -type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended -type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended -type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended -type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended type ibG2Affine interface { - bucketG2AffineC1 | - bucketG2AffineC2 | - bucketG2AffineC3 | - bucketG2AffineC4 | + bucketG2AffineC4 | bucketG2AffineC5 | bucketG2AffineC6 | bucketG2AffineC7 | @@ -1125,37 +405,113 @@ type ibG2Affine interface { bucketG2AffineC14 | bucketG2AffineC15 | bucketG2AffineC16 | - bucketG2AffineC17 | - bucketG2AffineC18 | - bucketG2AffineC19 | bucketG2AffineC20 | - bucketG2AffineC21 | - bucketG2AffineC22 | - bucketG2AffineC23 + bucketG2AffineC21 +} + +type BatchG2Affine[B ibG2Affine] struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G2Affine + buckets *B } -type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC2 | - bucketg2JacExtendedC3 | - bucketg2JacExtendedC4 | - bucketg2JacExtendedC5 | - bucketg2JacExtendedC6 | - bucketg2JacExtendedC7 | - bucketg2JacExtendedC8 | - bucketg2JacExtendedC9 | - bucketg2JacExtendedC10 | - bucketg2JacExtendedC11 | - bucketg2JacExtendedC12 | - bucketg2JacExtendedC13 | - bucketg2JacExtendedC14 | - bucketg2JacExtendedC15 | - bucketg2JacExtendedC16 | - bucketg2JacExtendedC17 | - bucketg2JacExtendedC18 | - bucketg2JacExtendedC19 | - bucketg2JacExtendedC20 | - bucketg2JacExtendedC21 | - bucketg2JacExtendedC22 | - bucketg2JacExtendedC23 +func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { + batchSize := len(*buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG2Affine[B]{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), + } +} + +func (b *BatchG2Affine[B]) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG2Affine[B]) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG2Affine[B]) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &(*b.buckets)[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(P) + } else { + BK.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P 
== 0
+		if BK.Equal(P) {
+			BK.setInfinity()
+			return
+		}
+	} else {
+		// if bucket == -P, B == 0
+		if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) {
+			BK.setInfinity()
+			return
+		}
+	}
+
+	// b.bucketIds[b.cptP] = op.bucketID
+	b.bucketIds[op.bucketID] = struct{}{}
+	b.R[b.cptP] = BK
+	if op.isNeg() {
+		b.P[b.cptP].Neg(P)
+	} else {
+		b.P[b.cptP].Set(P)
+	}
+	b.cptP++
+}
+
+func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp {
+	for i := len(queue) - 1; i >= 0; i-- {
+		if batch.CanAdd(queue[i].bucketID) {
+			batch.Add(queue[i])
+			if batch.IsFull() {
+				batch.ExecuteAndReset()
+			}
+			queue[i] = queue[len(queue)-1]
+			queue = queue[:len(queue)-1]
+		}
+	}
+	return queue
+
+}
diff --git a/ecc/bls24-315/multiexp_jacobian.go b/ecc/bls24-315/multiexp_jacobian.go
new file mode 100644
index 0000000000..4399395829
--- /dev/null
+++ b/ecc/bls24-315/multiexp_jacobian.go
@@ -0,0 +1,229 @@
+// Copyright 2020 ConsenSys Software Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package bls24315
+
+import (
+	"github.com/consensys/gnark-crypto/ecc/bls24-315/fr"
+)
+
+func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64,
+	chRes chan<- g1JacExtended,
+	c uint64,
+	points []G1Affine,
+	scalars []fr.Element) {
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))
+
+	var buckets B
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
+	}
+
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
+	}
+
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].addMixed(&points[i])
+		} else {
+			// sub
+			buckets[bits & ^msbWindow].subMixed(&points[i])
+		}
+	}
+
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1]
+
+	var runningSum, total g1JacExtended
+	runningSum.setInfinity()
+	total.setInfinity()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].ZZ.IsZero() {
+			runningSum.add(&buckets[k])
+		}
+		total.add(&runningSum)
+	}
+
+	chRes <- total
+
+}
+
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
+type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended
+type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended
+type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended
+type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended
+type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended
+type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended
+type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended
+type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended
+type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended
+type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended
+type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended
+type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended
+type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended
+type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended
+type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended
+type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended
+type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended
+
+type ibg1JacExtended interface {
+	bucketg1JacExtendedC1 |
+		bucketg1JacExtendedC3 |
+		bucketg1JacExtendedC4 |
+		bucketg1JacExtendedC5 |
+		bucketg1JacExtendedC6 |
+		bucketg1JacExtendedC7 |
+		bucketg1JacExtendedC8 |
+		bucketg1JacExtendedC9 |
+		bucketg1JacExtendedC10 |
+		bucketg1JacExtendedC11 |
+		bucketg1JacExtendedC12 |
+		bucketg1JacExtendedC13 |
+		bucketg1JacExtendedC14 |
+		bucketg1JacExtendedC15 |
+		bucketg1JacExtendedC16 |
+		bucketg1JacExtendedC20 |
+		bucketg1JacExtendedC21
+}
+
+func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64,
+	chRes chan<- g2JacExtended,
+	c uint64,
+	points []G2Affine,
+	scalars []fr.Element) {
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))
+
+	var buckets B
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
+	}
+
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
+	}
+
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].addMixed(&points[i])
+		} else {
+			// sub
+			buckets[bits & ^msbWindow].subMixed(&points[i])
+		}
+	}
+
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1]
+
+	var runningSum, total g2JacExtended
+	runningSum.setInfinity()
+	total.setInfinity()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].ZZ.IsZero() {
+			runningSum.add(&buckets[k])
+		}
+		total.add(&runningSum)
+	}
+
+	chRes <- total
+
+}
+
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
+type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended
+type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended
+type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended
+type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended
+type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended
+type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended
+type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended
+type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended
+type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended
+type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended
+type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended
+type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended
+type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended
+type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended
+type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended
+type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended
+type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended
+
+type ibg2JacExtended interface {
+	bucketg2JacExtendedC1 |
+		bucketg2JacExtendedC3 |
+		bucketg2JacExtendedC4 |
+		bucketg2JacExtendedC5 |
+		bucketg2JacExtendedC6 |
+		bucketg2JacExtendedC7 |
+		bucketg2JacExtendedC8 |
+		bucketg2JacExtendedC9 |
+		bucketg2JacExtendedC10 |
+		bucketg2JacExtendedC11 |
+		bucketg2JacExtendedC12 |
+		bucketg2JacExtendedC13 |
+		bucketg2JacExtendedC14 |
+		bucketg2JacExtendedC15 |
+		bucketg2JacExtendedC16 |
+		bucketg2JacExtendedC20 |
+		bucketg2JacExtendedC21
+}
diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go
index a942e21132..6a17d03fb4 100644
--- a/ecc/bls24-315/multiexp_test.go
+++ b/ecc/bls24-315/multiexp_test.go
@@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) {
 	}
 
 	scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
-	msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true)
+	innerMsmG1(&r16, 16, samplePoints[:], scalars16, true)
 
 	splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128})
 	splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51})
@@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) {
 	))
 
 	// cRange is generated from template and contains the available parameters for the multiexp window size
-	cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21}
+	cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
 	if testing.Short() {
 		// test only "odd" and "even" (ie windows size divide word size vs not)
 		cRange = []uint64{5, 16}
@@ -130,10 +130,10 @@ func TestMultiExpG1(t *testing.T) {
 			results := make([]G1Jac, len(cRange)+1)
 			for i, c := range cRange {
 				scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU())
-				msmInnerG1Jac(&results[i], int(c), samplePoints[:], scalars, false)
+				innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false)
 				if c == 16 {
 					// split the first chunk
-					msmInnerG1Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true)
+					innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true)
 				}
 			}
 			for i := 1; i < len(results); i++ {
@@ -171,10 +171,10 @@ func 
TestMultiExpG1(t *testing.T) { results := make([]G1Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG1Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -209,8 +209,8 @@ func TestMultiExpG1(t *testing.T) { var result1, result2 G1Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) if !result1.Equal(&result2) { return false } @@ -288,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -461,10 +461,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePoints[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -502,10 +502,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -540,8 +540,8 @@ func TestMultiExpG2(t *testing.T) { var result1, result2 G2Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result2, int(c), samplePoints[:], scalars, false) if 
!result1.Equal(&result2) { return false } @@ -619,7 +619,7 @@ func BenchmarkMultiExpG2(b *testing.B) { b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index 1950ae3ef6..2cc4feb7fd 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -25,143 +25,6 @@ import ( "runtime" ) -// selector stores the index, mask and shifts needed to select bits from a scalar -// it is used during the multiExp algorithm or the batch scalar multiplication -type selector struct { - index uint64 // index in the multi-word scalar to select bits from - mask uint64 // mask (c-bit wide) - shift uint64 // shift needed to get our bits on low positions - - multiWordSelect bool // set to true if we need to select bits from 2 words (case where c doesn't divide 64) - maskHigh uint64 // same than mask, for index+1 - shiftHigh uint64 // same than shift, for index+1 -} - -// partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits -// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract -// 2^{c} to the current digit, making it negative. -// negative digits can be processed in a later step as adding -G into the bucket instead of G -// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) -// scalarsMont indicates wheter the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { - toReturn := make([]fr.Element, len(scalars)) - - // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words - - // compute offset and word selector / shift to select the right bits of our windows - selectors := make([]selector, nbChunks) - for chunk := uint64(0); chunk < nbChunks; chunk++ { - jc := uint64(chunk * c) - d := selector{} - d.index = jc / 64 - d.shift = jc - (d.index * 64) - d.mask = mask << d.shift - d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1) - if d.multiWordSelect { - nbBitsHigh := d.shift - uint64(64-c) - d.maskHigh = (1 << nbBitsHigh) - 1 - d.shiftHigh = (c - nbBitsHigh) - } - selectors[chunk] = d - } - - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is 
not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK. - chSmallValues := make(chan int, nbTasks) - - parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 - for i := start; i < end; i++ { - var carry int - - scalar := scalars[i] - if scalarsMont { - scalar.FromMont() - } - if scalar.FitsOnOneWord() { - // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } - } - - // for each chunk in the scalar, compute the current digit, and an eventual carry - for chunk := uint64(0); chunk < nbChunks; chunk++ { - s := selectors[chunk] - - // init with carry if any - digit := carry - carry = 0 - - // digit = value of the c-bit window - digit += int((scalar[s.index] & s.mask) >> s.shift) - - if s.multiWordSelect { - // we are selecting bits over 2 words - digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh - } - - // if digit is zero, no impact on result - if digit == 0 { - continue - } - - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - if digit >= max { - digit -= (1 << c) - carry = 1 - } - - var bits uint64 - if digit >= 0 { - bits = uint64(digit) - } else { - bits = uint64(-digit-1) | msbWindow - } - - toReturn[i][s.index] |= (bits << s.shift) - if s.multiWordSelect { - toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - } - - } - } - - chSmallValues <- smallValues - - }, nbTasks) - - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o - } - return toReturn, smallValues -} - // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf // // This call return an error if len(scalars) != len(points) or if provided config is invalid. @@ -221,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -266,7 +129,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1Jac , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 // we have nbSplits intermediate results that we must sum together. 
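The selector being removed here (and reintroduced per chunk in multiexp_jacobian.go) picks bits [chunk*c, chunk*c+c) out of the little-endian limb array, borrowing the missing high bits from limb index+1 whenever a window straddles a 64-bit boundary. A compact sketch of just that indexing; window is a hypothetical helper, not a library function:

package main

import "fmt"

// window returns digit k of a little-endian limb array, i.e. bits [k*c, k*c+c).
func window(limbs []uint64, k, c uint64) uint64 {
	j := k * c
	index, shift := j/64, j%64
	mask := uint64(1)<<c - 1
	bits := (limbs[index] >> shift) & mask
	// when c does not divide 64, a window may straddle two limbs:
	// fetch the missing high bits from the next limb
	if shift > 64-c && index+1 < uint64(len(limbs)) {
		nbBitsHigh := shift + c - 64
		maskHigh := uint64(1)<<nbBitsHigh - 1
		bits |= (limbs[index+1] & maskHigh) << (c - nbBitsHigh)
	}
	return bits
}

func main() {
	limbs := []uint64{0xFFFFFFFFFFFFFFFF, 0x1}
	// with c = 5, chunk 12 covers bits 60..64: four bits from limb 0, one from limb 1
	fmt.Println(window(limbs, 12, 5)) // 31
}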
@@ -276,12 +139,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG1Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG1Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -290,169 +153,79 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG1Jac(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] + _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 10: - msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC10] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + _innerMsmG1(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 11: - msmCG1Affine[bucketg1JacExtendedC11, 
bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC11] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] + _innerMsmG1(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 12: - msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC12] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 13: - msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC13] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] + _innerMsmG1(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 14: - msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC14] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 15: - msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC15] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + _innerMsmG1(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 16: - msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) case 20: - msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC20] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] + _innerMsmG1(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 21: - msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC21] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - 
_p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -func msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64, - chRes chan<- g1JacExtended, - c uint64, - points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) - } - } - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - - var runningSum, total g1JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { nbChunks++ } + // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is @@ -464,45 +237,54 @@ func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, poi chChunks[i] = make(chan g1JacExtended, 1) } - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel last C restore. - msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } + // the last chunk may be processed with a different method than the rest, as it could be smaller. 
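When c does not divide fr.Limbs * 64, the most significant chunk is narrower than c bits, which is why _innerMsmG1 threads a separate processLastChunk that can use a smaller bucket array. A quick sketch of the arithmetic, assuming 256-bit scalars; note that c = 21 leaves a 4-bit top chunk, which matches the bucketg1JacExtendedC4 passed for case 21 below:

package main

import "fmt"

func main() {
	const bits = 4 * 64 // fr.Limbs * 64
	for _, c := range []uint64{5, 16, 21} {
		nbChunks := bits / c
		if bits%c != 0 {
			nbChunks++ // one extra, partial chunk at the top
		}
		lastC := bits - c*(nbChunks-1) // width of the most significant chunk
		fmt.Printf("c=%2d -> %2d chunks, last chunk %2d bits\n", c, nbChunks, lastC)
	}
}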
+	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars)

-	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars)
+	for j := int(nbChunks - 2); j > 0; j-- {
+		go processChunk(uint64(j), chChunks[j], c, points, scalars)
 	}

-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
-	}
+	// the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1]
+	// --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunks are processed
+	// in the ~same amount of time
+	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
+		if !splitFirstChunk {
+			go processChunk(0, chChunks[0], c, points, scalars)
+		} else {
+			chSplit := make(chan g1JacExtended, 2)
+			split := len(points) / 2
+			go processChunk(0, chSplit, c, points[:split], scalars[:split])
+			go processChunk(0, chSplit, c, points[split:], scalars[split:])
+			go func() {
+				s1 := <-chSplit
+				s2 := <-chSplit
+				close(chSplit)
+				s1.add(&s2)
+				chChunks[0] <- s1
+			}()
+		}

-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g1JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
-	}
+	}

 	return msmReduceChunkG1Affine(p, int(c), chChunks[:])
 }

+// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp
+func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac {
+	var _p g1JacExtended
+	totalj := <-chChunks[len(chChunks)-1]
+	_p.Set(&totalj)
+	for j := len(chChunks) - 2; j >= 0; j-- {
+		for l := 0; l < c; l++ {
+			_p.double(&_p)
+		}
+		totalj := <-chChunks[j]
+		_p.add(&totalj)
+	}
+
+	return p.unsafeFromJacExtended(&_p)
+}
+
 // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
 //
 // This call return an error if len(scalars) != len(points) or if provided config is invalid.
@@ -562,7 +344,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul
 	// we split recursively until nbChunks(c) >= nbTasks,
 	bestC := func(nbPoints int) uint64 {
 		// implemented msmC methods (the c we use must be in this slice)
-		implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
 		var C uint64
 		// approximate cost (in group operations)
 		// cost = bits/c * (nbPoints + 2^{c})
@@ -607,7 +389,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul
 	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)

 	// if we have more than 10% of small values, we split the processing of the first chunk in 2
-	// we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time
+	// we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time
 	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1

 	// we have nbSplits intermediate results that we must sum together.
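An aside on the window-size heuristic kept above: cost = bits/c * (nbPoints + 2^{c}) counts bits/c chunk passes, each of which walks every point once and then folds roughly 2^c buckets. The following is a minimal standalone Go sketch, not part of this patch, that evaluates this heuristic over the implementedCs slice from this change; the 256-bit scalar size and the sample point counts are assumptions for illustration.

package main

import (
	"fmt"
	"math"
)

func bestC(nbPoints int) uint64 {
	implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
	const bits = 4 * 64 // fr.Limbs * 64 for a 4-limb (256-bit) scalar field
	var C uint64
	min := math.MaxFloat64
	for _, c := range implementedCs {
		// bits/c chunk passes, each touching nbPoints points plus ~2^c buckets
		cost := float64(bits) * float64(nbPoints+(1<<c)) / float64(c)
		if cost < min {
			min = cost
			C = c
		}
	}
	return C
}

func main() {
	for _, n := range []int{1 << 10, 1 << 16, 1 << 22} {
		fmt.Printf("nbPoints=%d -> c=%d\n", n, bestC(n)) // c=8, c=13, c=20 under this cost model
	}
}

With this model the bucket term 2^c dominates for small inputs, so small windows win; only in the multi-million-point range does the heuristic drift toward the large windows (16, 20, 21) that this patch keeps.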
@@ -617,12 +399,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -631,82 +413,120 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] + _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 6: - msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 7: - msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 8: - msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] + _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 9: - msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 10: - msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC10] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + _innerMsmG2(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 11: - msmCG2Affine[bucketg2JacExtendedC11, 
bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC11] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] + _innerMsmG2(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 12: - msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC12] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 13: - msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC13] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] + _innerMsmG2(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 14: - msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC14] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 15: - msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC15] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] + _innerMsmG2(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 16: - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + case 20: + processChunk := processChunkG2BatchAffine[bucketG2AffineC20] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] + _innerMsmG2(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + case 21: + processChunk := processChunkG2BatchAffine[bucketG2AffineC21] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + default: + panic("not implemented") + } +} - case 17: - msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { - case 18: - msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } - case 19: - msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance - case 20: - msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 
0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan g2JacExtended, 1)
+	}
-
+	// the last chunk may be processed with a different method than the rest, as it could be smaller.
+	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars)
-
+	for j := int(nbChunks - 2); j > 0; j-- {
+		go processChunk(uint64(j), chChunks[j], c, points, scalars)
+	}
-
+	// the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1]
+	// --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunks are processed
+	// in the ~same amount of time
+	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
+		if !splitFirstChunk {
+			go processChunk(0, chChunks[0], c, points, scalars)
+		} else {
+			chSplit := make(chan g2JacExtended, 2)
+			split := len(points) / 2
+			go processChunk(0, chSplit, c, points[:split], scalars[:split])
+			go processChunk(0, chSplit, c, points[split:], scalars[split:])
+			go func() {
+				s1 := <-chSplit
+				s2 := <-chSplit
+				close(chSplit)
+				s1.add(&s2)
+				chChunks[0] <- s1
+			}()
+		}
-
 	}
+
+	return msmReduceChunkG2Affine(p, int(c), chChunks[:])
 }

 // msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp
@@ -725,121 +545,139 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J
 	return p.unsafeFromJacExtended(&_p)
 }

-func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64,
-	chRes chan<- g2JacExtended,
-	c uint64,
-	points []G2Affine,
-	scalars []fr.Element) {
+// selector stores the index, mask and shifts needed to select bits from a scalar
+// it is used during the multiExp algorithm or the batch scalar multiplication
+type selector struct {
+	index uint64 // index in the multi-word scalar to select bits from
+	mask  uint64 // mask (c-bit wide)
+	shift uint64 // shift needed to get our bits on low positions

-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
+	multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
+	maskHigh        uint64 // same as mask, for index+1
+	shiftHigh       uint64 // same as shift, for index+1
+}

-	var buckets B
-	for i := 0; i < len(buckets); i++ {
-		buckets[i].setInfinity()
-	}
+// partitionScalars computes, for each scalar over c-bit wide windows, nbChunks digits
+// if a digit is larger than 2^{c-1}, we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
+// scalarsMont indicates whether the provided scalars are in Montgomery form
+// returns smallValues, which represents the number of scalars that meet the following condition:
+// 0 < scalar < 2^c (in other words, scalars where only the c least significant bits are non-zero)
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+	toReturn := make([]fr.Element, len(scalars))

-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
 	}

-	// for each scalars, get the digit corresponding to the chunk we're processing.
-	for i := 0; i < len(scalars); i++ {
-		bits := (scalars[i][s.index] & s.mask) >> s.shift
-		if s.multiWordSelect {
-			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
-		}
-
-		if bits == 0 {
-			continue
-		}
+	mask := uint64((1 << c) - 1)      // low c bits are 1
+	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
+	max := int(1 << (c - 1))          // max value we want for our digits
+	cDivides64 := (64 % c) == 0       // if c doesn't divide 64, we may need to select over multiple words

-		// if msbWindow bit is set, we need to substract
-		if bits&msbWindow == 0 {
-			// add
-			buckets[bits-1].addMixed(&points[i])
-		} else {
-			// sub
-			buckets[bits & ^msbWindow].subMixed(&points[i])
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
 		}
+		selectors[chunk] = d
 	}

-	// reduce buckets into total
-	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split the first chunk
	// processing in the msm in 2, to ensure all go routines finish at ~same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it does, though, this will deadlock.
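To make the digit decomposition above concrete, here is a standalone sketch, not part of this patch, of the same signed recoding on a bare uint64 rather than an fr.Element; it skips the msbWindow bit-packing the real code uses to store negative digits and just keeps them as signed integers. Every digit lands in [-2^{c-1}, 2^{c-1}) and the weighted sum of the digits reconstructs the scalar.

package main

import "fmt"

// recode splits scalar into signed base-2^c digits in [-2^{c-1}, 2^{c-1}).
func recode(scalar uint64, c uint) []int64 {
	max := int64(1) << (c - 1)
	var digits []int64
	carry := int64(0)
	for scalar != 0 || carry != 0 {
		d := carry + int64(scalar&((1<<c)-1)) // digit = carry + low c-bit window
		scalar >>= c
		carry = 0
		if d >= max { // too large: borrow 2^c from the next window
			d -= 1 << c
			carry = 1
		}
		digits = append(digits, d)
	}
	return digits
}

func main() {
	const c = 4
	s := uint64(0xDEADBEEF)
	digits := recode(s, c)
	// reconstruct sum digits[i] * 2^{c*i}, highest digit first
	acc := int64(0)
	for i := len(digits) - 1; i >= 0; i-- {
		acc = acc<<c + digits[i]
	}
	fmt.Println(digits, acc == int64(s)) // prints the signed digits and true
}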
+ chSmallValues := make(chan int, nbTasks) - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } + parallel.Execute(len(scalars), func(start, end int) { + smallValues := 0 + for i := start; i < end; i++ { + var carry int - chRes <- total + scalar := scalars[i] + if scalarsMont { + scalar.FromMont() + } + if scalar.FitsOnOneWord() { + // everything is 0, no need to process this scalar + if scalar[0] == 0 { + continue + } + // low c-bits are 1 in mask + if scalar[0]&mask == scalar[0] { + smallValues++ + } + } -} + // for each chunk in the scalar, compute the current digit, and an eventual carry + for chunk := uint64(0); chunk < nbChunks; chunk++ { + s := selectors[chunk] -func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // init with carry if any + digit := carry + carry = 0 - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G2Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g2JacExtended, 1<<(lastC-1)) - // TODO @gbotrel last C restore. - msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars) - } + // if digit is zero, no impact on result + if digit == 0 { + continue + } - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract + // 2^{c} to the current digit, making it negative. 
+ if digit >= max { + digit -= (1 << c) + carry = 1 + } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + var bits uint64 + if digit >= 0 { + bits = uint64(digit) + } else { + bits = uint64(-digit-1) | msbWindow + } - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) + toReturn[i][s.index] |= (bits << s.shift) + if s.multiWordSelect { + toReturn[i][s.index+1] |= (bits >> s.shiftHigh) + } + + } + } + + chSmallValues <- smallValues + + }, nbTasks) + + // aggregate small values + close(chSmallValues) + smallValues := 0 + for o := range chSmallValues { + smallValues += o + } + return toReturn, smallValues } diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index 1965405349..916d8beced 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -17,11 +17,7 @@ package bls24317 import ( - "errors" - "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls24-317/fr" - "math" - "runtime" ) const MAX_BATCH_SIZE = 600 @@ -34,320 +30,13 @@ func (o batchOp) isNeg() bool { return o.pointID&1 == 1 } -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// processChunkG1BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. // -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) { - var _p G1Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. 
- // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
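The step-3 reduction referenced above (msmReduceChunk) combines the per-chunk partial sums with Horner's rule: the result is the sum of chunk_j * 2^{c*j}, obtained by c doublings between consecutive chunk additions, highest chunk first. A standalone sketch follows, not part of this patch, with big.Int addition standing in for the extended-Jacobian group law.

package main

import (
	"fmt"
	"math/big"
)

// reduceChunks mirrors the msmReduceChunk* shape: start from the highest
// chunk, double c times, then add the next chunk down (Horner's rule).
func reduceChunks(chunks []*big.Int, c uint) *big.Int {
	total := new(big.Int).Set(chunks[len(chunks)-1])
	for j := len(chunks) - 2; j >= 0; j-- {
		for l := uint(0); l < c; l++ {
			total.Add(total, total) // stands in for _p.double(&_p)
		}
		total.Add(total, chunks[j]) // stands in for _p.add(&totalj)
	}
	return total
}

func main() {
	c := uint(5)
	// per-chunk partial sums, least significant chunk first
	chunks := []*big.Int{big.NewInt(7), big.NewInt(3), big.NewInt(1)}
	// expected: 7 + 3*2^5 + 1*2^10 = 1127
	fmt.Println(reduceChunks(chunks, c)) // 1127
}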
- _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - - case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - - case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - - case 10: - batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - - case 11: - batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - - case 12: - batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - - case 13: - batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - - case 14: - batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - - case 15: - batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - - case 16: - batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - - case 20: - batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - - case 21: - batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -type BatchG1Affine[B ibG1Affine] struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points 
[]G1Affine - buckets *B -} - -func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { - batchSize := len(*buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG1Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} - -func (b *BatchG1Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG1Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG1Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} - -func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -441,66 +130,8 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, } -func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g1JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. 
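The batch machinery above exists to amortize the field inversion in the affine addition law: ExecuteAndReset hands a full batch to BatchAddG1Affine, which relies on Montgomery's batch-inversion trick, one modular inversion plus roughly 3(n-1) multiplications for n slope denominators. Below is a standalone sketch of that trick, not part of this patch; big.Int arithmetic modulo a toy prime stands in for the base field.

package main

import (
	"fmt"
	"math/big"
)

// batchInvert inverts all a[i] mod p with a single ModInverse call,
// trading per-element inversions for a handful of multiplications.
func batchInvert(a []*big.Int, p *big.Int) []*big.Int {
	n := len(a)
	res := make([]*big.Int, n)
	acc := big.NewInt(1)
	for i := 0; i < n; i++ { // res[i] = a[0]*...*a[i-1] (prefix products)
		res[i] = new(big.Int).Set(acc)
		acc.Mul(acc, a[i]).Mod(acc, p)
	}
	acc.ModInverse(acc, p) // single inversion of the full product
	for i := n - 1; i >= 0; i-- {
		res[i].Mul(res[i], acc).Mod(res[i], p) // (a[0]..a[i-1]) * (a[0]..a[i])^-1 = a[i]^-1
		acc.Mul(acc, a[i]).Mod(acc, p)         // acc becomes (a[0]..a[i-1])^-1
	}
	return res
}

func main() {
	p := big.NewInt(101) // toy prime standing in for the base field modulus
	a := []*big.Int{big.NewInt(3), big.NewInt(7), big.NewInt(42)}
	for i, inv := range batchInvert(a, p) {
		check := new(big.Int).Mul(a[i], inv)
		fmt.Println(check.Mod(check, p)) // 1, 1, 1
	}
}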
- msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, int(c), chChunks[:]) -} - -type bucketG1AffineC1 [1 << (1 - 1)]G1Affine -type bucketG1AffineC2 [1 << (2 - 1)]G1Affine -type bucketG1AffineC3 [1 << (3 - 1)]G1Affine +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack type bucketG1AffineC4 [1 << (4 - 1)]G1Affine type bucketG1AffineC5 [1 << (5 - 1)]G1Affine type bucketG1AffineC6 [1 << (6 - 1)]G1Affine @@ -514,42 +145,11 @@ type bucketG1AffineC13 [1 << (13 - 1)]G1Affine type bucketG1AffineC14 [1 << (14 - 1)]G1Affine type bucketG1AffineC15 [1 << (15 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine -type bucketG1AffineC17 [1 << (17 - 1)]G1Affine -type bucketG1AffineC18 [1 << (18 - 1)]G1Affine -type bucketG1AffineC19 [1 << (19 - 1)]G1Affine type bucketG1AffineC20 [1 << (20 - 1)]G1Affine type bucketG1AffineC21 [1 << (21 - 1)]G1Affine -type bucketG1AffineC22 [1 << (22 - 1)]G1Affine -type bucketG1AffineC23 [1 << (23 - 1)]G1Affine -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended -type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended -type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended -type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended -type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended -type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended -type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended -type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended -type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended -type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended -type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended -type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended -type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended -type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended -type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended -type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended -type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended -type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended type ibG1Affine interface { - bucketG1AffineC1 | - bucketG1AffineC2 | - bucketG1AffineC3 | - bucketG1AffineC4 | + bucketG1AffineC4 | bucketG1AffineC5 | bucketG1AffineC6 | bucketG1AffineC7 | @@ -562,258 +162,21 @@ type ibG1Affine interface { bucketG1AffineC14 | bucketG1AffineC15 | bucketG1AffineC16 | - bucketG1AffineC17 | - bucketG1AffineC18 | - bucketG1AffineC19 | bucketG1AffineC20 | - bucketG1AffineC21 | - bucketG1AffineC22 | - 
bucketG1AffineC23 -} - -type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC2 | - bucketg1JacExtendedC3 | - bucketg1JacExtendedC4 | - bucketg1JacExtendedC5 | - bucketg1JacExtendedC6 | - bucketg1JacExtendedC7 | - bucketg1JacExtendedC8 | - bucketg1JacExtendedC9 | - bucketg1JacExtendedC10 | - bucketg1JacExtendedC11 | - bucketg1JacExtendedC12 | - bucketg1JacExtendedC13 | - bucketg1JacExtendedC14 | - bucketg1JacExtendedC15 | - bucketg1JacExtendedC16 | - bucketg1JacExtendedC17 | - bucketg1JacExtendedC18 | - bucketg1JacExtendedC19 | - bucketg1JacExtendedC20 | - bucketg1JacExtendedC21 | - bucketg1JacExtendedC22 | - bucketg1JacExtendedC23 -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. 
- // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
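The sizing loop above decides how the work is spread across CPUs: if bestC(nbPoints) yields fewer than nbTasks chunks, the point set is halved (and nbSplits doubled) until nbChunks * nbSplits covers the task budget. A standalone sketch of that loop follows, not part of this patch; the fixed-window bestC placeholder and the 256-bit scalar size are assumptions.

package main

import "fmt"

const scalarBits = 4 * 64 // fr.Limbs * 64 for a 4-limb scalar field

// bestC placeholder: a fixed threshold keeps the sketch short.
func bestC(nbPoints int) uint64 {
	if nbPoints < 1<<14 {
		return 8
	}
	return 16
}

// splitPlan mirrors the driver loop: halve the points and double the splits
// until nbChunks(C) * nbSplits reaches the task budget.
func splitPlan(nbPoints, nbTasks int) (C uint64, nbSplits int) {
	nbSplits = 1
	nbChunks := 0
	for nbChunks < nbTasks {
		C = bestC(nbPoints)
		nbChunks = scalarBits / int(C) // number of c-bit radixes in a scalar
		if scalarBits%int(C) != 0 {
			nbChunks++
		}
		nbChunks *= nbSplits
		if nbChunks < nbTasks {
			nbSplits <<= 1
			nbPoints >>= 1
		}
	}
	return C, nbSplits
}

func main() {
	C, nbSplits := splitPlan(1<<20, 64)
	fmt.Println(C, nbSplits) // 16 4: four splits of 16 chunks each cover 64 tasks
}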
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 1: - msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - - case 4: - msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - - case 5: - msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - - case 8: - msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - - case 10: - batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - - case 11: - batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - - case 12: - batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - - case 13: - batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - - case 14: - batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - - case 15: - batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - - case 16: - batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - - case 20: - batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - - case 21: - batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } + bucketG1AffineC21 } -type BatchG2Affine[B ibG2Affine] struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine +type BatchG1Affine[B ibG1Affine] struct { + P 
[MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine batchSize int cptP int bucketIds map[uint32]struct{} - points []G2Affine + points []G1Affine buckets *B } -func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { +func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -821,7 +184,7 @@ func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine if batchSize <= 0 { batchSize = 1 } - return BatchG2Affine[B]{ + return BatchG1Affine[B]{ buckets: buckets, points: points, batchSize: batchSize, @@ -829,18 +192,18 @@ func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine } } -func (b *BatchG2Affine[B]) IsFull() bool { +func (b *BatchG1Affine[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *BatchG2Affine[B]) ExecuteAndReset() { +func (b *BatchG1Affine[B]) ExecuteAndReset() { if b.cptP == 0 { return } // for i := 0; i < len(b.R); i++ { // b.R[i].Add(b.R[i], b.P[i]) // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) for k := range b.bucketIds { delete(b.bucketIds, k) } @@ -848,12 +211,12 @@ func (b *BatchG2Affine[B]) ExecuteAndReset() { b.cptP = 0 } -func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { +func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *BatchG2Affine[B]) Add(op batchOp) { +func (b *BatchG1Affine[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &(*b.buckets)[op.bucketID] @@ -895,7 +258,7 @@ func (b *BatchG2Affine[B]) Add(op batchOp) { b.cptP++ } -func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { +func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -910,7 +273,13 @@ func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B] } -func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, +// processChunkG2BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. 
+// +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -1004,66 +373,8 @@ func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, } -func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G2Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g2JacExtended, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. - msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) -} - -type bucketG2AffineC1 [1 << (1 - 1)]G2Affine -type bucketG2AffineC2 [1 << (2 - 1)]G2Affine -type bucketG2AffineC3 [1 << (3 - 1)]G2Affine +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack type bucketG2AffineC4 [1 << (4 - 1)]G2Affine type bucketG2AffineC5 [1 << (5 - 1)]G2Affine type bucketG2AffineC6 [1 << (6 - 1)]G2Affine @@ -1077,42 +388,11 @@ type bucketG2AffineC13 [1 << (13 - 1)]G2Affine type bucketG2AffineC14 [1 << (14 - 1)]G2Affine type bucketG2AffineC15 [1 << (15 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine -type bucketG2AffineC17 [1 << (17 - 1)]G2Affine -type bucketG2AffineC18 [1 << (18 - 1)]G2Affine -type bucketG2AffineC19 [1 << (19 - 1)]G2Affine type bucketG2AffineC20 [1 << (20 - 1)]G2Affine type bucketG2AffineC21 [1 << (21 - 1)]G2Affine -type bucketG2AffineC22 [1 << (22 - 1)]G2Affine -type bucketG2AffineC23 [1 << (23 - 1)]G2Affine -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended -type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended -type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended -type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended -type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended 
-type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended -type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended -type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended -type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended -type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended -type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended -type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended -type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended -type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended -type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended -type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended -type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended -type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended type ibG2Affine interface { - bucketG2AffineC1 | - bucketG2AffineC2 | - bucketG2AffineC3 | - bucketG2AffineC4 | + bucketG2AffineC4 | bucketG2AffineC5 | bucketG2AffineC6 | bucketG2AffineC7 | @@ -1125,37 +405,113 @@ type ibG2Affine interface { bucketG2AffineC14 | bucketG2AffineC15 | bucketG2AffineC16 | - bucketG2AffineC17 | - bucketG2AffineC18 | - bucketG2AffineC19 | bucketG2AffineC20 | - bucketG2AffineC21 | - bucketG2AffineC22 | - bucketG2AffineC23 + bucketG2AffineC21 +} + +type BatchG2Affine[B ibG2Affine] struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G2Affine + buckets *B } -type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC2 | - bucketg2JacExtendedC3 | - bucketg2JacExtendedC4 | - bucketg2JacExtendedC5 | - bucketg2JacExtendedC6 | - bucketg2JacExtendedC7 | - bucketg2JacExtendedC8 | - bucketg2JacExtendedC9 | - bucketg2JacExtendedC10 | - bucketg2JacExtendedC11 | - bucketg2JacExtendedC12 | - bucketg2JacExtendedC13 | - bucketg2JacExtendedC14 | - bucketg2JacExtendedC15 | - bucketg2JacExtendedC16 | - bucketg2JacExtendedC17 | - bucketg2JacExtendedC18 | - bucketg2JacExtendedC19 | - bucketg2JacExtendedC20 | - bucketg2JacExtendedC21 | - bucketg2JacExtendedC22 | - bucketg2JacExtendedC23 +func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { + batchSize := len(*buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG2Affine[B]{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), + } +} + +func (b *BatchG2Affine[B]) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG2Affine[B]) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG2Affine[B]) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &(*b.buckets)[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(P) + } else { + BK.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P 
== 0 + if BK.Equal(P) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = BK + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + } diff --git a/ecc/bls24-317/multiexp_jacobian.go b/ecc/bls24-317/multiexp_jacobian.go new file mode 100644 index 0000000000..d948e2c697 --- /dev/null +++ b/ecc/bls24-317/multiexp_jacobian.go @@ -0,0 +1,229 @@ +// Copyright 2020 ConsenSys Software Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package bls24317 + +import ( + "github.com/consensys/gnark-crypto/ecc/bls24-317/fr" +) + +func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, + chRes chan<- g1JacExtended, + c uint64, + points []G1Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + // for each scalars, get the digit corresponding to the chunk we're processing. + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + buckets[bits-1].addMixed(&points[i]) + } else { + // sub + buckets[bits & ^msbWindow].subMixed(&points[i]) + } + } + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1]
+
+	var runningSum, total g1JacExtended
+	runningSum.setInfinity()
+	total.setInfinity()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].ZZ.IsZero() {
+			runningSum.add(&buckets[k])
+		}
+		total.add(&runningSum)
+	}
+
+	chRes <- total
+
+}
+
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
+type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended
+type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended
+type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended
+type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended
+type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended
+type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended
+type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended
+type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended
+type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended
+type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended
+type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended
+type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended
+type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended
+type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended
+type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended
+type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended
+type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended
+
+type ibg1JacExtended interface {
+	bucketg1JacExtendedC1 |
+	bucketg1JacExtendedC3 |
+	bucketg1JacExtendedC4 |
+	bucketg1JacExtendedC5 |
+	bucketg1JacExtendedC6 |
+	bucketg1JacExtendedC7 |
+	bucketg1JacExtendedC8 |
+	bucketg1JacExtendedC9 |
+	bucketg1JacExtendedC10 |
+	bucketg1JacExtendedC11 |
+	bucketg1JacExtendedC12 |
+	bucketg1JacExtendedC13 |
+	bucketg1JacExtendedC14 |
+	bucketg1JacExtendedC15 |
+	bucketg1JacExtendedC16 |
+	bucketg1JacExtendedC20 |
+	bucketg1JacExtendedC21
+}
+
+func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64,
+	chRes chan<- g2JacExtended,
+	c uint64,
+	points []G2Affine,
+	scalars []fr.Element) {
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))
+
+	var buckets B
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
+	}
+
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
+	}
+
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].addMixed(&points[i])
+		} else {
+			// sub
+			buckets[bits & ^msbWindow].subMixed(&points[i])
+		}
+	}
+
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ...
+ n*bucket[n-1]
+
+	var runningSum, total g2JacExtended
+	runningSum.setInfinity()
+	total.setInfinity()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].ZZ.IsZero() {
+			runningSum.add(&buckets[k])
+		}
+		total.add(&runningSum)
+	}
+
+	chRes <- total
+
+}
+
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
+type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended
+type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended
+type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended
+type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended
+type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended
+type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended
+type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended
+type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended
+type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended
+type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended
+type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended
+type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended
+type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended
+type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended
+type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended
+type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended
+type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended
+
+type ibg2JacExtended interface {
+	bucketg2JacExtendedC1 |
+	bucketg2JacExtendedC3 |
+	bucketg2JacExtendedC4 |
+	bucketg2JacExtendedC5 |
+	bucketg2JacExtendedC6 |
+	bucketg2JacExtendedC7 |
+	bucketg2JacExtendedC8 |
+	bucketg2JacExtendedC9 |
+	bucketg2JacExtendedC10 |
+	bucketg2JacExtendedC11 |
+	bucketg2JacExtendedC12 |
+	bucketg2JacExtendedC13 |
+	bucketg2JacExtendedC14 |
+	bucketg2JacExtendedC15 |
+	bucketg2JacExtendedC16 |
+	bucketg2JacExtendedC20 |
+	bucketg2JacExtendedC21
+}
diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go
index 293f3ec6ef..7e39930e23 100644
--- a/ecc/bls24-317/multiexp_test.go
+++ b/ecc/bls24-317/multiexp_test.go
@@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) {
 	}
 
 	scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
-	msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true)
+	innerMsmG1(&r16, 16, samplePoints[:], scalars16, true)
 
 	splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128})
 	splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51})
@@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) {
 	))
 
 	// cRange is generated from template and contains the available parameters for the multiexp window size
-	cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21}
+	cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
 	if testing.Short() {
 		// test only "odd" and "even" (ie windows size divide word size vs not)
 		cRange = []uint64{5, 16}
@@ -130,10 +130,10 @@ func TestMultiExpG1(t *testing.T) {
 			results := make([]G1Jac, len(cRange)+1)
 			for i, c := range cRange {
 				scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU())
-				msmInnerG1Jac(&results[i], int(c), samplePoints[:], scalars, false)
+				innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false)
 				if c == 16 {
 					// split the first chunk
-					msmInnerG1Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true)
+					innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true)
 				}
 			}
 			for i := 1; i < len(results); i++ {
@@ -171,10 +171,10 @@ func
TestMultiExpG1(t *testing.T) { results := make([]G1Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG1Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -209,8 +209,8 @@ func TestMultiExpG1(t *testing.T) { var result1, result2 G1Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) if !result1.Equal(&result2) { return false } @@ -288,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -461,10 +461,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePoints[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -502,10 +502,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -540,8 +540,8 @@ func TestMultiExpG2(t *testing.T) { var result1, result2 G2Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result2, int(c), samplePoints[:], scalars, false) if 
!result1.Equal(&result2) { return false } @@ -619,7 +619,7 @@ func BenchmarkMultiExpG2(b *testing.B) { b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index c2ecab3d61..e519895eb5 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -25,143 +25,6 @@ import ( "runtime" ) -// selector stores the index, mask and shifts needed to select bits from a scalar -// it is used during the multiExp algorithm or the batch scalar multiplication -type selector struct { - index uint64 // index in the multi-word scalar to select bits from - mask uint64 // mask (c-bit wide) - shift uint64 // shift needed to get our bits on low positions - - multiWordSelect bool // set to true if we need to select bits from 2 words (case where c doesn't divide 64) - maskHigh uint64 // same than mask, for index+1 - shiftHigh uint64 // same than shift, for index+1 -} - -// partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits -// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract -// 2^{c} to the current digit, making it negative. -// negative digits can be processed in a later step as adding -G into the bucket instead of G -// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) -// scalarsMont indicates wheter the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { - toReturn := make([]fr.Element, len(scalars)) - - // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words - - // compute offset and word selector / shift to select the right bits of our windows - selectors := make([]selector, nbChunks) - for chunk := uint64(0); chunk < nbChunks; chunk++ { - jc := uint64(chunk * c) - d := selector{} - d.index = jc / 64 - d.shift = jc - (d.index * 64) - d.mask = mask << d.shift - d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1) - if d.multiWordSelect { - nbBitsHigh := d.shift - uint64(64-c) - d.maskHigh = (1 << nbBitsHigh) - 1 - d.shiftHigh = (c - nbBitsHigh) - } - selectors[chunk] = d - } - - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is not going to spawn 
more than nbTasks go routine - // if it does, though, this will deadlocK. - chSmallValues := make(chan int, nbTasks) - - parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 - for i := start; i < end; i++ { - var carry int - - scalar := scalars[i] - if scalarsMont { - scalar.FromMont() - } - if scalar.FitsOnOneWord() { - // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } - } - - // for each chunk in the scalar, compute the current digit, and an eventual carry - for chunk := uint64(0); chunk < nbChunks; chunk++ { - s := selectors[chunk] - - // init with carry if any - digit := carry - carry = 0 - - // digit = value of the c-bit window - digit += int((scalar[s.index] & s.mask) >> s.shift) - - if s.multiWordSelect { - // we are selecting bits over 2 words - digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh - } - - // if digit is zero, no impact on result - if digit == 0 { - continue - } - - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - if digit >= max { - digit -= (1 << c) - carry = 1 - } - - var bits uint64 - if digit >= 0 { - bits = uint64(digit) - } else { - bits = uint64(-digit-1) | msbWindow - } - - toReturn[i][s.index] |= (bits << s.shift) - if s.multiWordSelect { - toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - } - - } - } - - chSmallValues <- smallValues - - }, nbTasks) - - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o - } - return toReturn, smallValues -} - // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf // // This call return an error if len(scalars) != len(points) or if provided config is invalid. @@ -221,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -266,7 +129,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1Jac , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 // we have nbSplits intermediate results that we must sum together. 
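
The bestC closure above scores every implemented window width c with cost(c) = (fr.Limbs*64/c) * (nbPoints + 2^c): a scalar splits into roughly bits/c chunks, each chunk touches every point once and pays one pass over its buckets (approximated as 2^c). Below is a minimal standalone sketch of that cost model, not part of the patch; scalarBits stands in for fr.Limbs*64 and the names are illustrative.

package main

import (
	"fmt"
	"math"
)

// bestC mirrors the selection loop in MultiExp: pick the window width c
// that minimizes cost(c) = scalarBits/c * (nbPoints + 2^c).
func bestC(nbPoints, scalarBits int, implementedCs []uint64) uint64 {
	var C uint64
	min := math.MaxFloat64
	for _, c := range implementedCs {
		cost := float64(scalarBits*(nbPoints+(1<<c))) / float64(c)
		if cost < min {
			min = cost
			C = c
		}
	}
	return C
}

func main() {
	cs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
	for _, n := range []int{1 << 10, 1 << 16, 1 << 22} {
		fmt.Printf("nbPoints=%d -> c=%d\n", n, bestC(n, 256, cs))
	}
}

For small instances the 2^c bucket term dominates and small windows win; as nbPoints grows, the bits/c factor dominates and larger windows pay off, which is why the trimmed cRange still keeps 20 and 21.
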
@@ -276,12 +139,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG1Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG1Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -290,169 +153,79 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG1Jac(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] + _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 10: - msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC10] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + _innerMsmG1(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 11: - msmCG1Affine[bucketg1JacExtendedC11, 
bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC11] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] + _innerMsmG1(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 12: - msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC12] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 13: - msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC13] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] + _innerMsmG1(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 14: - msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC14] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 15: - msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC15] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + _innerMsmG1(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 16: - msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) case 20: - msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC20] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] + _innerMsmG1(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 21: - msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC21] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - 
_p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -func msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64, - chRes chan<- g1JacExtended, - c uint64, - points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) - } - } - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - - var runningSum, total g1JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { nbChunks++ } + // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is @@ -464,45 +237,54 @@ func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, poi chChunks[i] = make(chan g1JacExtended, 1) } - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel last C restore. - msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } + // the last chunk may be processed with a different method than the rest, as it could be smaller. 
+ go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars) + for j := int(nbChunks - 2); j > 0; j-- { + go processChunk(uint64(j), chChunks[j], c, points, scalars) } - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] + // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed + // in the ~same amount of time + if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + if !splitFirstChunk { + go processChunk(0, chChunks[0], c, points, scalars) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, chSplit, c, points[:split], scalars[:split]) + go processChunk(0, chSplit, c, points[split:], scalars[split:]) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } +// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { + var _p g1JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf // // This call return an error if len(scalars) != len(points) or if provided config is invalid. @@ -562,7 +344,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -607,7 +389,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 // we have nbSplits intermediate results that we must sum together. 
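
The per-chunk loops and msmReduceChunkG1Affine both rely on the same running-sum identity: bucket[k] holds the points whose digit is k+1, and the chunk total sum_k (k+1)*bucket[k] equals the sum of all suffix sums sum_{j>=k} bucket[j], so the reduction needs only about 2*len(buckets) group additions and no scalar multiplications. A small integer sketch of that reduction, not part of the patch:

package main

import "fmt"

// reduceBuckets computes 1*b[0] + 2*b[1] + ... + n*b[n-1] with additions
// only: after the iteration for index k, runningSum = b[k] + ... + b[n-1],
// and adding it into total once per iteration weights b[j] exactly j+1 times.
func reduceBuckets(buckets []int) int {
	runningSum, total := 0, 0
	for k := len(buckets) - 1; k >= 0; k-- {
		runningSum += buckets[k]
		total += runningSum
	}
	return total
}

func main() {
	b := []int{3, 0, 7, 1}        // buckets for digits 1, 2, 3, 4
	fmt.Println(reduceBuckets(b)) // 28 = 1*3 + 2*0 + 3*7 + 4*1
}

The !buckets[k].ZZ.IsZero() guard in the real code merely skips the addition for empty buckets; it does not change the identity.
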
@@ -617,12 +399,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -631,82 +413,120 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] + _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 6: - msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 7: - msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 8: - msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] + _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 9: - msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 10: - msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC10] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + _innerMsmG2(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 11: - msmCG2Affine[bucketg2JacExtendedC11, 
bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC11] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] + _innerMsmG2(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 12: - msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC12] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 13: - msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC13] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] + _innerMsmG2(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 14: - msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC14] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 15: - msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC15] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] + _innerMsmG2(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 16: - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + case 20: + processChunk := processChunkG2BatchAffine[bucketG2AffineC20] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] + _innerMsmG2(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + case 21: + processChunk := processChunkG2BatchAffine[bucketG2AffineC21] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + default: + panic("not implemented") + } +} - case 17: - msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { - case 18: - msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } - case 19: - msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance - case 20: - msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 
0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } - case 21: - msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) + // the last chunk may be processed with a different method than the rest, as it could be smaller. + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) - case 22: - msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) + for j := int(nbChunks - 2); j > 0; j-- { + go processChunk(uint64(j), chChunks[j], c, points, scalars) + } - case 23: - msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) + // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] + // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed + // in the ~same amount of time + if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + if !splitFirstChunk { + go processChunk(0, chChunks[0], c, points, scalars) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, chSplit, c, points[:split], scalars[:split]) + go processChunk(0, chSplit, c, points[split:], scalars[split:]) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } - default: - panic("not implemented") } + + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } // msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp @@ -725,121 +545,139 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J return p.unsafeFromJacExtended(&_p) } -func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64, - chRes chan<- g2JacExtended, - c uint64, - points []G2Affine, - scalars []fr.Element) { +// selector stores the index, mask and shifts needed to select bits from a scalar +// it is used during the multiExp algorithm or the batch scalar multiplication +type selector struct { + index uint64 // index in the multi-word scalar to select bits from + mask uint64 // mask (c-bit wide) + shift uint64 // shift needed to get our bits on low positions - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + multiWordSelect bool // set to true if we need to select bits from 2 words (case where c doesn't divide 64) + maskHigh uint64 // same than mask, for index+1 + shiftHigh uint64 // same than shift, for index+1 +} - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } +// partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits +// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract +// 2^{c} to the current digit, making it negative. 
+// negative digits can be processed in a later step as adding -G into the bucket instead of G +// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) +// scalarsMont indicates wheter the provided scalars are in montgomery form +// returns smallValues, which represent the number of scalars which meets the following condition +// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { + toReturn := make([]fr.Element, len(scalars)) - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) + // number of c-bit radixes in a scalar + nbChunks := fr.Limbs * 64 / c + if (fr.Limbs*64)%c != 0 { + nbChunks++ } - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window + max := int(1 << (c - 1)) // max value we want for our digits + cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + // compute offset and word selector / shift to select the right bits of our windows + selectors := make([]selector, nbChunks) + for chunk := uint64(0); chunk < nbChunks; chunk++ { + jc := uint64(chunk * c) + d := selector{} + d.index = jc / 64 + d.shift = jc - (d.index * 64) + d.mask = mask << d.shift + d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1) + if d.multiWordSelect { + nbBitsHigh := d.shift - uint64(64-c) + d.maskHigh = (1 << nbBitsHigh) - 1 + d.shiftHigh = (c - nbBitsHigh) } + selectors[chunk] = d } - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + // for each chunk, we could track the number of non-zeros points we will need to process + // this way, if a chunk has more work to do than others, we can spawn off more go routines + // (at the cost of more buckets allocated) + // a simplified approach is to track the small values where only the first word is set + // if this number represent a significant number of points, then we will split first chunk + // processing in the msm in 2, to ensure all go routines finish at ~same time + // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine + // if it does, though, this will deadlocK. 
+ chSmallValues := make(chan int, nbTasks) - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } + parallel.Execute(len(scalars), func(start, end int) { + smallValues := 0 + for i := start; i < end; i++ { + var carry int - chRes <- total + scalar := scalars[i] + if scalarsMont { + scalar.FromMont() + } + if scalar.FitsOnOneWord() { + // everything is 0, no need to process this scalar + if scalar[0] == 0 { + continue + } + // low c-bits are 1 in mask + if scalar[0]&mask == scalar[0] { + smallValues++ + } + } -} + // for each chunk in the scalar, compute the current digit, and an eventual carry + for chunk := uint64(0); chunk < nbChunks; chunk++ { + s := selectors[chunk] -func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // init with carry if any + digit := carry + carry = 0 - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G2Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g2JacExtended, 1<<(lastC-1)) - // TODO @gbotrel last C restore. - msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars) - } + // if digit is zero, no impact on result + if digit == 0 { + continue + } - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract + // 2^{c} to the current digit, making it negative. 
+ if digit >= max { + digit -= (1 << c) + carry = 1 + } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + var bits uint64 + if digit >= 0 { + bits = uint64(digit) + } else { + bits = uint64(-digit-1) | msbWindow + } - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) + toReturn[i][s.index] |= (bits << s.shift) + if s.multiWordSelect { + toReturn[i][s.index+1] |= (bits >> s.shiftHigh) + } + + } + } + + chSmallValues <- smallValues + + }, nbTasks) + + // aggregate small values + close(chSmallValues) + smallValues := 0 + for o := range chSmallValues { + smallValues += o + } + return toReturn, smallValues } diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index 89db40edae..8b10c9786f 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -17,11 +17,7 @@ package bn254 import ( - "errors" - "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bn254/fr" - "math" - "runtime" ) const MAX_BATCH_SIZE = 600 @@ -34,320 +30,13 @@ func (o batchOp) isNeg() bool { return o.pointID&1 == 1 } -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// processChunkG1BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. // -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) { - var _p G1Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. 
- // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - - case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - - case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - - case 10: - batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - - case 11: - batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - - case 12: - batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - - case 13: - batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - - case 14: - batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - - case 15: - batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - - case 16: - batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - - case 20: - batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - - case 21: - batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -type BatchG1Affine[B ibG1Affine] struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points 
[]G1Affine - buckets *B -} - -func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { - batchSize := len(*buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG1Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} - -func (b *BatchG1Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG1Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG1Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} - -func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -441,66 +130,8 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, } -func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g1JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. 
- msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, int(c), chChunks[:]) -} - -type bucketG1AffineC1 [1 << (1 - 1)]G1Affine -type bucketG1AffineC2 [1 << (2 - 1)]G1Affine -type bucketG1AffineC3 [1 << (3 - 1)]G1Affine +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack type bucketG1AffineC4 [1 << (4 - 1)]G1Affine type bucketG1AffineC5 [1 << (5 - 1)]G1Affine type bucketG1AffineC6 [1 << (6 - 1)]G1Affine @@ -514,42 +145,11 @@ type bucketG1AffineC13 [1 << (13 - 1)]G1Affine type bucketG1AffineC14 [1 << (14 - 1)]G1Affine type bucketG1AffineC15 [1 << (15 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine -type bucketG1AffineC17 [1 << (17 - 1)]G1Affine -type bucketG1AffineC18 [1 << (18 - 1)]G1Affine -type bucketG1AffineC19 [1 << (19 - 1)]G1Affine type bucketG1AffineC20 [1 << (20 - 1)]G1Affine type bucketG1AffineC21 [1 << (21 - 1)]G1Affine -type bucketG1AffineC22 [1 << (22 - 1)]G1Affine -type bucketG1AffineC23 [1 << (23 - 1)]G1Affine -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended -type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended -type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended -type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended -type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended -type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended -type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended -type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended -type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended -type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended -type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended -type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended -type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended -type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended -type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended -type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended -type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended -type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended type ibG1Affine interface { - bucketG1AffineC1 | - bucketG1AffineC2 | - bucketG1AffineC3 | - bucketG1AffineC4 | + bucketG1AffineC4 | bucketG1AffineC5 | bucketG1AffineC6 | bucketG1AffineC7 | @@ -562,258 +162,21 @@ type ibG1Affine interface { bucketG1AffineC14 | bucketG1AffineC15 | bucketG1AffineC16 | - bucketG1AffineC17 | - bucketG1AffineC18 | - bucketG1AffineC19 | bucketG1AffineC20 | - bucketG1AffineC21 | - bucketG1AffineC22 | - 
bucketG1AffineC23 -} - -type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC2 | - bucketg1JacExtendedC3 | - bucketg1JacExtendedC4 | - bucketg1JacExtendedC5 | - bucketg1JacExtendedC6 | - bucketg1JacExtendedC7 | - bucketg1JacExtendedC8 | - bucketg1JacExtendedC9 | - bucketg1JacExtendedC10 | - bucketg1JacExtendedC11 | - bucketg1JacExtendedC12 | - bucketg1JacExtendedC13 | - bucketg1JacExtendedC14 | - bucketg1JacExtendedC15 | - bucketg1JacExtendedC16 | - bucketg1JacExtendedC17 | - bucketg1JacExtendedC18 | - bucketg1JacExtendedC19 | - bucketg1JacExtendedC20 | - bucketg1JacExtendedC21 | - bucketg1JacExtendedC22 | - bucketg1JacExtendedC23 -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. 
- // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 1: - msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - - case 4: - msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - - case 5: - msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - - case 8: - msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - - case 10: - batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - - case 11: - batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - - case 12: - batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - - case 13: - batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - - case 14: - batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - - case 15: - batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - - case 16: - batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - - case 20: - batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - - case 21: - batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } + bucketG1AffineC21 } -type BatchG2Affine[B ibG2Affine] struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine +type BatchG1Affine[B ibG1Affine] struct { + P 
[MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine batchSize int cptP int bucketIds map[uint32]struct{} - points []G2Affine + points []G1Affine buckets *B } -func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { +func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -821,7 +184,7 @@ func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine if batchSize <= 0 { batchSize = 1 } - return BatchG2Affine[B]{ + return BatchG1Affine[B]{ buckets: buckets, points: points, batchSize: batchSize, @@ -829,18 +192,18 @@ func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine } } -func (b *BatchG2Affine[B]) IsFull() bool { +func (b *BatchG1Affine[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *BatchG2Affine[B]) ExecuteAndReset() { +func (b *BatchG1Affine[B]) ExecuteAndReset() { if b.cptP == 0 { return } // for i := 0; i < len(b.R); i++ { // b.R[i].Add(b.R[i], b.P[i]) // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) for k := range b.bucketIds { delete(b.bucketIds, k) } @@ -848,12 +211,12 @@ func (b *BatchG2Affine[B]) ExecuteAndReset() { b.cptP = 0 } -func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { +func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *BatchG2Affine[B]) Add(op batchOp) { +func (b *BatchG1Affine[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &(*b.buckets)[op.bucketID] @@ -895,7 +258,7 @@ func (b *BatchG2Affine[B]) Add(op batchOp) { b.cptP++ } -func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { +func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -910,7 +273,13 @@ func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B] } -func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, +// processChunkG2BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. 
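A batch may hold at most one pending addition per bucket: a flush computes every sum from the bucket values as they stood when the batch was formed, so a second op on the same bucket would read a stale value. A toy integer sketch that mimics the CanAdd/queue discipline (hypothetical names, ints standing in for points):

package main

import "fmt"

// toyBatch mimics the CanAdd/Add/ExecuteAndReset flow: at most one pending
// addition per bucket within a batch.
type toyBatch struct {
	pending map[int]int // bucketID -> value waiting to be added
	buckets []int
}

func (b *toyBatch) canAdd(bucketID int) bool {
	_, busy := b.pending[bucketID]
	return !busy
}

func (b *toyBatch) add(bucketID, v int) { b.pending[bucketID] = v }

func (b *toyBatch) executeAndReset() {
	for id, v := range b.pending { // one "batched" pass over all pending ops
		b.buckets[id] += v
	}
	b.pending = make(map[int]int)
}

func main() {
	b := &toyBatch{pending: make(map[int]int), buckets: make([]int, 4)}
	var queue [][2]int // ops that hit a busy bucket wait here
	ops := [][2]int{{1, 10}, {2, 20}, {1, 30}, {3, 5}} // bucket 1 appears twice
	for _, op := range ops {
		if b.canAdd(op[0]) {
			b.add(op[0], op[1])
		} else {
			queue = append(queue, op) // same bucket already in this batch
		}
	}
	b.executeAndReset()
	for _, op := range queue { // queued ops go into the next batch
		b.add(op[0], op[1])
	}
	b.executeAndReset()
	fmt.Println(b.buckets) // [0 40 20 5]
}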
+// +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -1004,66 +373,8 @@ func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, } -func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G2Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g2JacExtended, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. - msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) -} - -type bucketG2AffineC1 [1 << (1 - 1)]G2Affine -type bucketG2AffineC2 [1 << (2 - 1)]G2Affine -type bucketG2AffineC3 [1 << (3 - 1)]G2Affine +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack type bucketG2AffineC4 [1 << (4 - 1)]G2Affine type bucketG2AffineC5 [1 << (5 - 1)]G2Affine type bucketG2AffineC6 [1 << (6 - 1)]G2Affine @@ -1077,42 +388,11 @@ type bucketG2AffineC13 [1 << (13 - 1)]G2Affine type bucketG2AffineC14 [1 << (14 - 1)]G2Affine type bucketG2AffineC15 [1 << (15 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine -type bucketG2AffineC17 [1 << (17 - 1)]G2Affine -type bucketG2AffineC18 [1 << (18 - 1)]G2Affine -type bucketG2AffineC19 [1 << (19 - 1)]G2Affine type bucketG2AffineC20 [1 << (20 - 1)]G2Affine type bucketG2AffineC21 [1 << (21 - 1)]G2Affine -type bucketG2AffineC22 [1 << (22 - 1)]G2Affine -type bucketG2AffineC23 [1 << (23 - 1)]G2Affine -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended -type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended -type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended -type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended -type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended 
-type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended -type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended -type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended -type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended -type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended -type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended -type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended -type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended -type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended -type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended -type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended -type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended -type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended type ibG2Affine interface { - bucketG2AffineC1 | - bucketG2AffineC2 | - bucketG2AffineC3 | - bucketG2AffineC4 | + bucketG2AffineC4 | bucketG2AffineC5 | bucketG2AffineC6 | bucketG2AffineC7 | @@ -1125,37 +405,113 @@ type ibG2Affine interface { bucketG2AffineC14 | bucketG2AffineC15 | bucketG2AffineC16 | - bucketG2AffineC17 | - bucketG2AffineC18 | - bucketG2AffineC19 | bucketG2AffineC20 | - bucketG2AffineC21 | - bucketG2AffineC22 | - bucketG2AffineC23 + bucketG2AffineC21 +} + +type BatchG2Affine[B ibG2Affine] struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G2Affine + buckets *B } -type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC2 | - bucketg2JacExtendedC3 | - bucketg2JacExtendedC4 | - bucketg2JacExtendedC5 | - bucketg2JacExtendedC6 | - bucketg2JacExtendedC7 | - bucketg2JacExtendedC8 | - bucketg2JacExtendedC9 | - bucketg2JacExtendedC10 | - bucketg2JacExtendedC11 | - bucketg2JacExtendedC12 | - bucketg2JacExtendedC13 | - bucketg2JacExtendedC14 | - bucketg2JacExtendedC15 | - bucketg2JacExtendedC16 | - bucketg2JacExtendedC17 | - bucketg2JacExtendedC18 | - bucketg2JacExtendedC19 | - bucketg2JacExtendedC20 | - bucketg2JacExtendedC21 | - bucketg2JacExtendedC22 | - bucketg2JacExtendedC23 +func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { + batchSize := len(*buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG2Affine[B]{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), + } +} + +func (b *BatchG2Affine[B]) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG2Affine[B]) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG2Affine[B]) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &(*b.buckets)[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(P) + } else { + BK.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P 
== 0
+ if BK.Equal(P) {
+ BK.setInfinity()
+ return
+ }
+ } else {
+ // if bucket == -P, B == 0
+ if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) {
+ BK.setInfinity()
+ return
+ }
+ }
+
+ // b.bucketIds[b.cptP] = op.bucketID
+ b.bucketIds[op.bucketID] = struct{}{}
+ b.R[b.cptP] = BK
+ if op.isNeg() {
+ b.P[b.cptP].Neg(P)
+ } else {
+ b.P[b.cptP].Set(P)
+ }
+ b.cptP++
+}
+
+func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp {
+ for i := len(queue) - 1; i >= 0; i-- {
+ if batch.CanAdd(queue[i].bucketID) {
+ batch.Add(queue[i])
+ if batch.IsFull() {
+ batch.ExecuteAndReset()
+ }
+ queue[i] = queue[len(queue)-1]
+ queue = queue[:len(queue)-1]
+ }
+ }
+ return queue
+
}
diff --git a/ecc/bn254/multiexp_jacobian.go b/ecc/bn254/multiexp_jacobian.go
new file mode 100644
index 0000000000..4939af44c8
--- /dev/null
+++ b/ecc/bn254/multiexp_jacobian.go
@@ -0,0 +1,229 @@
+// Copyright 2020 ConsenSys Software Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package bn254
+
+import (
+ "github.com/consensys/gnark-crypto/ecc/bn254/fr"
+)
+
+func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64,
+ chRes chan<- g1JacExtended,
+ c uint64,
+ points []G1Affine,
+ scalars []fr.Element) {
+
+ mask := uint64((1 << c) - 1) // low c bits are 1
+ msbWindow := uint64(1 << (c - 1))
+
+ var buckets B
+ for i := 0; i < len(buckets); i++ {
+ buckets[i].setInfinity()
+ }
+
+ jc := uint64(chunk * c)
+ s := selector{}
+ s.index = jc / 64
+ s.shift = jc - (s.index * 64)
+ s.mask = mask << s.shift
+ s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+ if s.multiWordSelect {
+ nbBitsHigh := s.shift - uint64(64-c)
+ s.maskHigh = (1 << nbBitsHigh) - 1
+ s.shiftHigh = (c - nbBitsHigh)
+ }
+
+ // for each scalar, get the digit corresponding to the chunk we're processing.
+ for i := 0; i < len(scalars); i++ {
+ bits := (scalars[i][s.index] & s.mask) >> s.shift
+ if s.multiWordSelect {
+ bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+ }
+
+ if bits == 0 {
+ continue
+ }
+
+ // if msbWindow bit is set, we need to subtract
+ if bits&msbWindow == 0 {
+ // add
+ buckets[bits-1].addMixed(&points[i])
+ } else {
+ // sub
+ buckets[bits & ^msbWindow].subMixed(&points[i])
+ }
+ }
+
+ // reduce buckets into total
+ // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1]
+
+ var runningSum, total g1JacExtended
+ runningSum.setInfinity()
+ total.setInfinity()
+ for k := len(buckets) - 1; k >= 0; k-- {
+ if !buckets[k].ZZ.IsZero() {
+ runningSum.add(&buckets[k])
+ }
+ total.add(&runningSum)
+ }
+
+ chRes <- total
+
+}
+
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
+type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended
+type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended
+type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended
+type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended
+type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended
+type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended
+type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended
+type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended
+type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended
+type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended
+type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended
+type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended
+type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended
+type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended
+type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended
+type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended
+type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended
+
+type ibg1JacExtended interface {
+ bucketg1JacExtendedC1 |
+ bucketg1JacExtendedC3 |
+ bucketg1JacExtendedC4 |
+ bucketg1JacExtendedC5 |
+ bucketg1JacExtendedC6 |
+ bucketg1JacExtendedC7 |
+ bucketg1JacExtendedC8 |
+ bucketg1JacExtendedC9 |
+ bucketg1JacExtendedC10 |
+ bucketg1JacExtendedC11 |
+ bucketg1JacExtendedC12 |
+ bucketg1JacExtendedC13 |
+ bucketg1JacExtendedC14 |
+ bucketg1JacExtendedC15 |
+ bucketg1JacExtendedC16 |
+ bucketg1JacExtendedC20 |
+ bucketg1JacExtendedC21
+}
+
+func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64,
+ chRes chan<- g2JacExtended,
+ c uint64,
+ points []G2Affine,
+ scalars []fr.Element) {
+
+ mask := uint64((1 << c) - 1) // low c bits are 1
+ msbWindow := uint64(1 << (c - 1))
+
+ var buckets B
+ for i := 0; i < len(buckets); i++ {
+ buckets[i].setInfinity()
+ }
+
+ jc := uint64(chunk * c)
+ s := selector{}
+ s.index = jc / 64
+ s.shift = jc - (s.index * 64)
+ s.mask = mask << s.shift
+ s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+ if s.multiWordSelect {
+ nbBitsHigh := s.shift - uint64(64-c)
+ s.maskHigh = (1 << nbBitsHigh) - 1
+ s.shiftHigh = (c - nbBitsHigh)
+ }
+
+ // for each scalar, get the digit corresponding to the chunk we're processing.
+ for i := 0; i < len(scalars); i++ {
+ bits := (scalars[i][s.index] & s.mask) >> s.shift
+ if s.multiWordSelect {
+ bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+ }
+
+ if bits == 0 {
+ continue
+ }
+
+ // if msbWindow bit is set, we need to subtract
+ if bits&msbWindow == 0 {
+ // add
+ buckets[bits-1].addMixed(&points[i])
+ } else {
+ // sub
+ buckets[bits & ^msbWindow].subMixed(&points[i])
+ }
+ }
+
+ // reduce buckets into total
+ // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].ZZ.IsZero() { + runningSum.add(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack +type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended +type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended +type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended +type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended +type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended +type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended +type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended +type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended +type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended +type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended +type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended +type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended +type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended + +type ibg2JacExtended interface { + bucketg2JacExtendedC1 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC4 | + bucketg2JacExtendedC5 | + bucketg2JacExtendedC6 | + bucketg2JacExtendedC7 | + bucketg2JacExtendedC8 | + bucketg2JacExtendedC9 | + bucketg2JacExtendedC10 | + bucketg2JacExtendedC11 | + bucketg2JacExtendedC12 | + bucketg2JacExtendedC13 | + bucketg2JacExtendedC14 | + bucketg2JacExtendedC15 | + bucketg2JacExtendedC16 | + bucketg2JacExtendedC20 | + bucketg2JacExtendedC21 +} diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index 8a8ee0e90d..7fbb203ce1 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) + innerMsmG1(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} + cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -130,10 +130,10 @@ func TestMultiExpG1(t *testing.T) { results := make([]G1Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&results[i], int(c), samplePoints[:], scalars, false) + innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG1Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -171,10 +171,10 @@ func 
TestMultiExpG1(t *testing.T) { results := make([]G1Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG1Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -209,8 +209,8 @@ func TestMultiExpG1(t *testing.T) { var result1, result2 G1Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) if !result1.Equal(&result2) { return false } @@ -288,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -461,10 +461,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePoints[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -502,10 +502,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -540,8 +540,8 @@ func TestMultiExpG2(t *testing.T) { var result1, result2 G2Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result2, int(c), samplePoints[:], scalars, false) if 
!result1.Equal(&result2) { return false } @@ -619,7 +619,7 @@ func BenchmarkMultiExpG2(b *testing.B) { b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index e49016a90e..ceb1ad7847 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -25,143 +25,6 @@ import ( "runtime" ) -// selector stores the index, mask and shifts needed to select bits from a scalar -// it is used during the multiExp algorithm or the batch scalar multiplication -type selector struct { - index uint64 // index in the multi-word scalar to select bits from - mask uint64 // mask (c-bit wide) - shift uint64 // shift needed to get our bits on low positions - - multiWordSelect bool // set to true if we need to select bits from 2 words (case where c doesn't divide 64) - maskHigh uint64 // same than mask, for index+1 - shiftHigh uint64 // same than shift, for index+1 -} - -// partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits -// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract -// 2^{c} to the current digit, making it negative. -// negative digits can be processed in a later step as adding -G into the bucket instead of G -// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) -// scalarsMont indicates wheter the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { - toReturn := make([]fr.Element, len(scalars)) - - // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words - - // compute offset and word selector / shift to select the right bits of our windows - selectors := make([]selector, nbChunks) - for chunk := uint64(0); chunk < nbChunks; chunk++ { - jc := uint64(chunk * c) - d := selector{} - d.index = jc / 64 - d.shift = jc - (d.index * 64) - d.mask = mask << d.shift - d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1) - if d.multiWordSelect { - nbBitsHigh := d.shift - uint64(64-c) - d.maskHigh = (1 << nbBitsHigh) - 1 - d.shiftHigh = (c - nbBitsHigh) - } - selectors[chunk] = d - } - - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is not going 
to spawn more than nbTasks go routine - // if it does, though, this will deadlocK. - chSmallValues := make(chan int, nbTasks) - - parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 - for i := start; i < end; i++ { - var carry int - - scalar := scalars[i] - if scalarsMont { - scalar.FromMont() - } - if scalar.FitsOnOneWord() { - // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } - } - - // for each chunk in the scalar, compute the current digit, and an eventual carry - for chunk := uint64(0); chunk < nbChunks; chunk++ { - s := selectors[chunk] - - // init with carry if any - digit := carry - carry = 0 - - // digit = value of the c-bit window - digit += int((scalar[s.index] & s.mask) >> s.shift) - - if s.multiWordSelect { - // we are selecting bits over 2 words - digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh - } - - // if digit is zero, no impact on result - if digit == 0 { - continue - } - - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - if digit >= max { - digit -= (1 << c) - carry = 1 - } - - var bits uint64 - if digit >= 0 { - bits = uint64(digit) - } else { - bits = uint64(-digit-1) | msbWindow - } - - toReturn[i][s.index] |= (bits << s.shift) - if s.multiWordSelect { - toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - } - - } - } - - chSmallValues <- smallValues - - }, nbTasks) - - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o - } - return toReturn, smallValues -} - // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf // // This call return an error if len(scalars) != len(points) or if provided config is invalid. @@ -221,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} + implementedCs := []uint64{4, 5, 8, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -266,7 +129,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1Jac , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 // we have nbSplits intermediate results that we must sum together. 
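The bestC heuristic above can be probed on its own. A standalone sketch of the cost scan, assuming 256-bit scalars (i.e. fr.Limbs * 64 = 256) and the implemented window sizes {4, 5, 8, 16}:

package main

import (
	"fmt"
	"math"
)

// bestC scans the implemented window sizes and keeps the one minimizing the
// approximate group-operation count (bits/c) * (nbPoints + 2^c).
func bestC(nbPoints int, scalarBits float64, implementedCs []uint64) uint64 {
	var C uint64
	bestCost := math.MaxFloat64
	for _, c := range implementedCs {
		cost := scalarBits / float64(c) * float64(nbPoints+(1<<c))
		if cost < bestCost {
			bestCost = cost
			C = c
		}
	}
	return C
}

func main() {
	implementedCs := []uint64{4, 5, 8, 16}
	for _, n := range []int{1 << 8, 1 << 14, 1 << 20} {
		fmt.Printf("%8d points -> c = %d\n", n, bestC(n, 256, implementedCs))
	}
}

Larger inputs tolerate bigger (costlier) bucket arrays because the per-point work shrinks as bits/c, which is why the scan picks c = 5, 8, 16 for 2^8, 2^14, 2^20 points respectively.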
@@ -276,12 +139,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG1Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG1Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -290,169 +153,34 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG1Jac(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC2](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC5](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC2](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC5](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] + _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processChunk) case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC5](p, 9, points, scalars, splitFirstChunk) - - case 10: - msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC10](p, 10, points, scalars, splitFirstChunk) - - case 11: - msmCG1Affine[bucketg1JacExtendedC11, bucketg1JacExtendedC1](p, 11, points, scalars, splitFirstChunk) - - case 12: - msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC8](p, 12, points, scalars, splitFirstChunk) - - case 13: - msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC8](p, 13, points, scalars, splitFirstChunk) - - case 14: - msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC12](p, 14, points, scalars, splitFirstChunk) - - case 15: - msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC5](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] + _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 16: - msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC14](p, 17, points, scalars, splitFirstChunk) - - case 18: - msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC14](p, 18, points, scalars, splitFirstChunk) - - case 19: - msmCG1Affine[bucketg1JacExtendedC19, 
bucketg1JacExtendedC16](p, 19, points, scalars, splitFirstChunk) - - case 20: - msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC20](p, 20, points, scalars, splitFirstChunk) - - case 21: - msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC5](p, 21, points, scalars, splitFirstChunk) - - case 22: - msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC12](p, 22, points, scalars, splitFirstChunk) - - case 23: - msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC21](p, 23, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") } } -// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -func msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64, - chRes chan<- g1JacExtended, - c uint64, - points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) - } - } - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] - - var runningSum, total g1JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { nbChunks++ } + // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is @@ -464,45 +192,54 @@ func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, poi chChunks[i] = make(chan g1JacExtended, 1) } - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel last C restore. - msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } + // the last chunk may be processed with a different method than the rest, as it could be smaller. + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars) + for j := int(nbChunks - 2); j > 0; j-- { + go processChunk(uint64(j), chChunks[j], c, points, scalars) } - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] + // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed + // in the ~same amount of time + if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. 
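Aside: every chunk processor reduces its buckets as total = bucket[0] + 2*bucket[1] + ... + n*bucket[n-1] with a single backward pass and a running sum, two additions per bucket. An integer stand-in (plain additions in place of point additions):

package main

import "fmt"

// reduceBuckets computes b[0] + 2*b[1] + ... + n*b[n-1]: runningSum
// accumulates b[n-1]+...+b[k], and adding it into total once per k
// contributes b[k] exactly k+1 times overall. (The real code also skips
// empty buckets when updating runningSum; the result is the same.)
func reduceBuckets(buckets []int) int {
	runningSum, total := 0, 0
	for k := len(buckets) - 1; k >= 0; k-- {
		runningSum += buckets[k]
		total += runningSum
	}
	return total
}

func main() {
	buckets := []int{3, 0, 7, 1} // bucket[k] holds the sum of points with digit k+1
	naive := 0
	for k, b := range buckets {
		naive += (k + 1) * b // 1*3 + 2*0 + 3*7 + 4*1
	}
	fmt.Println(reduceBuckets(buckets), naive) // 28 28
}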
+ if !splitFirstChunk { + go processChunk(0, chChunks[0], c, points, scalars) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, chSplit, c, points[:split], scalars[:split]) + go processChunk(0, chSplit, c, points[split:], scalars[split:]) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } +// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { + var _p g1JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf // // This call return an error if len(scalars) != len(points) or if provided config is invalid. @@ -562,7 +299,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} + implementedCs := []uint64{4, 5, 8, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -607,7 +344,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 // we have nbSplits intermediate results that we must sum together. 
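msmReduceChunkG1Affine above is a Horner evaluation in base 2^c: between consecutive chunk totals it doubles c times, which multiplies the accumulator by 2^c. An integer analogue (doubling a number stands in for doubling a point):

package main

import "fmt"

// reduceChunks folds per-chunk partial sums T_j into sum_j T_j * 2^(c*j),
// doubling c times between consecutive chunks, mirroring the
// _p.double(&_p) loop in msmReduceChunkG1Affine.
func reduceChunks(chunkTotals []int, c int) int {
	p := chunkTotals[len(chunkTotals)-1] // highest chunk first
	for j := len(chunkTotals) - 2; j >= 0; j-- {
		for l := 0; l < c; l++ {
			p *= 2 // stands in for _p.double(&_p)
		}
		p += chunkTotals[j]
	}
	return p
}

func main() {
	const c = 4
	// chunk digits of the scalar 0x321 seen in base 2^4, LSB chunk first
	chunkTotals := []int{0x1, 0x2, 0x3}
	fmt.Println(reduceChunks(chunkTotals, c) == 0x321) // true
}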
@@ -617,12 +354,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -631,82 +368,75 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC2](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC5](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC2](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC5](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] + _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processChunk) case 8: - msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC5](p, 9, points, scalars, splitFirstChunk) - - case 10: - msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC10](p, 10, points, scalars, splitFirstChunk) - - case 11: - msmCG2Affine[bucketg2JacExtendedC11, bucketg2JacExtendedC1](p, 11, points, scalars, splitFirstChunk) - - case 12: - msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC8](p, 12, points, scalars, splitFirstChunk) - - case 13: - msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC8](p, 13, points, scalars, splitFirstChunk) - - case 14: - msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC12](p, 14, points, scalars, splitFirstChunk) - - case 15: - msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC5](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] + _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 16: - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + default: + panic("not implemented") + } +} - case 17: - msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC14](p, 17, points, scalars, 
splitFirstChunk) +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { - case 18: - msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC14](p, 18, points, scalars, splitFirstChunk) + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } - case 19: - msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC16](p, 19, points, scalars, splitFirstChunk) + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance - case 20: - msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC20](p, 20, points, scalars, splitFirstChunk) + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } - case 21: - msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC5](p, 21, points, scalars, splitFirstChunk) + // the last chunk may be processed with a different method than the rest, as it could be smaller. + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) - case 22: - msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC12](p, 22, points, scalars, splitFirstChunk) + for j := int(nbChunks - 2); j > 0; j-- { + go processChunk(uint64(j), chChunks[j], c, points, scalars) + } - case 23: - msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC21](p, 23, points, scalars, splitFirstChunk) + // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] + // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed + // in the ~same amount of time + if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. 
+		if !splitFirstChunk {
+			go processChunk(0, chChunks[0], c, points, scalars)
+		} else {
+			chSplit := make(chan g2JacExtended, 2)
+			split := len(points) / 2
+			go processChunk(0, chSplit, c, points[:split], scalars[:split])
+			go processChunk(0, chSplit, c, points[split:], scalars[split:])
+			go func() {
+				s1 := <-chSplit
+				s2 := <-chSplit
+				close(chSplit)
+				s1.add(&s2)
+				chChunks[0] <- s1
+			}()
+		}

-	default:
-		panic("not implemented")
	}
+
+	return msmReduceChunkG2Affine(p, int(c), chChunks[:])
}

// msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp
@@ -725,121 +455,139 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J
	return p.unsafeFromJacExtended(&_p)
}

-func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64,
-	chRes chan<- g2JacExtended,
-	c uint64,
-	points []G2Affine,
-	scalars []fr.Element) {
+// selector stores the index, mask and shifts needed to select bits from a scalar
+// it is used during the multiExp algorithm or the batch scalar multiplication
+type selector struct {
+	index uint64 // index in the multi-word scalar to select bits from
+	mask  uint64 // mask (c-bit wide)
+	shift uint64 // shift needed to get our bits on low positions

-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
+	multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
+	maskHigh        uint64 // same as mask, for index+1
+	shiftHigh       uint64 // same as shift, for index+1
+}

-	var buckets B
-	for i := 0; i < len(buckets); i++ {
-		buckets[i].setInfinity()
-	}
+// partitionScalars computes, for each scalar, its nbChunks digits over c-bit wide windows.
+// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
+// scalarsMont indicates whether the provided scalars are in Montgomery form
+// returns smallValues, which represents the number of scalars that meet the following condition:
+// 0 < scalar < 2^c (in other words, scalars where only the c least significant bits are non-zero)
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+	toReturn := make([]fr.Element, len(scalars))

-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
	}

-	// for each scalars, get the digit corresponding to the chunk we're processing.
-	for i := 0; i < len(scalars); i++ {
-		bits := (scalars[i][s.index] & s.mask) >> s.shift
-		if s.multiWordSelect {
-			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
-		}
-
-		if bits == 0 {
-			continue
-		}
+	mask := uint64((1 << c) - 1)      // low c bits are 1
+	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
+	max := int(1 << (c - 1))          // max value we want for our digits
+	cDivides64 := (64 % c) == 0       // if c doesn't divide 64, we may need to select over multiple words

-		// if msbWindow bit is set, we need to substract
-		if bits&msbWindow == 0 {
-			// add
-			buckets[bits-1].addMixed(&points[i])
-		} else {
-			// sub
-			buckets[bits & ^msbWindow].subMixed(&points[i])
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
		}
+		selectors[chunk] = d
	}

-	// reduce buckets into total
-	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split the first chunk
+	// processing in the msm in 2, to ensure all go routines finish at ~the same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it does, though, this will deadlock.
+	chSmallValues := make(chan int, nbTasks)

-	var runningSum, total g2JacExtended
-	runningSum.setInfinity()
-	total.setInfinity()
-	for k := len(buckets) - 1; k >= 0; k-- {
-		if !buckets[k].ZZ.IsZero() {
-			runningSum.add(&buckets[k])
-		}
-		total.add(&runningSum)
-	}
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int

-	chRes <- total
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.FitsOnOneWord() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c-bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}

-}
+			// for each chunk in the scalar, compute the current digit, and a possible carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]

-func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac {
-	nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	if (fr.Limbs*64)%c != 0 {
-		nbChunks++
-	}
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+			// init with carry if any
+			digit := carry
+			carry = 0

-	// each go routine sends its result in chChunks[i] channel
-	chChunks := make([]chan g2JacExtended, nbChunks)
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g2JacExtended, 1)
-	}
+			// digit = value of the c-bit window
+			digit += int((scalar[s.index] & s.mask) >> s.shift)

-	if (fr.Limbs*64)%c != 0 {
-		// TODO @gbotrel not always needed to do ext jac here.
-		go func(j uint64, points []G2Affine, scalars []fr.Element) {
-			// var buckets LB
-			// lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-			// buckets := make([]g2JacExtended, 1<<(lastC-1))
-			// TODO @gbotrel last C restore.
-			msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars)
-		}(uint64(nbChunks-1), points, scalars)
-		nbChunks--
-	}
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}

-	processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) {
-		msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars)
-	}
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}

-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
-	}
+				// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+				// 2^{c} from the current digit, making it negative.
+ if digit >= max { + digit -= (1 << c) + carry = 1 + } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + var bits uint64 + if digit >= 0 { + bits = uint64(digit) + } else { + bits = uint64(-digit-1) | msbWindow + } - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) + toReturn[i][s.index] |= (bits << s.shift) + if s.multiWordSelect { + toReturn[i][s.index+1] |= (bits >> s.shiftHigh) + } + + } + } + + chSmallValues <- smallValues + + }, nbTasks) + + // aggregate small values + close(chSmallValues) + smallValues := 0 + for o := range chSmallValues { + smallValues += o + } + return toReturn, smallValues } diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index 5064b454a6..221079f874 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -17,11 +17,7 @@ package bw6633 import ( - "errors" - "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-633/fr" - "math" - "runtime" ) const MAX_BATCH_SIZE = 600 @@ -34,210 +30,118 @@ func (o batchOp) isNeg() bool { return o.pointID&1 == 1 } -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// processChunkG1BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. // -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) { - var _p G1Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. 
- // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, + chRes chan<- g1JacExtended, + c uint64, + points []G1Affine, + scalars []fr.Element) { - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() } - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) } - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } + batch := newBatchG1Affine(&buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh } - // empirical, needs to be tuned. 
- // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 + if bits == 0 { + continue } - } - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. - _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } } - - msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG1Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. 
} - close(chDone) - return p, nil -} - -func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC2](p, 3, points, scalars, splitFirstChunk) - - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - - case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC5](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC2](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC5](p, 7, points, scalars, splitFirstChunk) - - case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC5](p, 9, points, scalars, splitFirstChunk) - - case 10: - batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC10](p, 10, points, scalars, splitFirstChunk) - - case 11: - batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC1](p, 11, points, scalars, splitFirstChunk) - - case 12: - batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC8](p, 12, points, scalars, splitFirstChunk) - - case 13: - batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC8](p, 13, points, scalars, splitFirstChunk) - - case 14: - batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC12](p, 14, points, scalars, splitFirstChunk) - - case 15: - batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC5](p, 15, points, scalars, splitFirstChunk) - - case 16: - batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC14](p, 17, points, scalars, splitFirstChunk) - - case 18: - batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC14](p, 18, points, scalars, splitFirstChunk) + // flush items in batch. + batch.ExecuteAndReset() - case 19: - batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC16](p, 19, points, scalars, splitFirstChunk) + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1]

-	case 20:
-		batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC20](p, 20, points, scalars, splitFirstChunk)
+	var runningSum, total g1JacExtended
+	runningSum.setInfinity()
+	total.setInfinity()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].IsInfinity() {
+			runningSum.addMixed(&buckets[k])
+		}
+		total.add(&runningSum)
+	}

-	case 21:
-		batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC5](p, 21, points, scalars, splitFirstChunk)
+	chRes <- total

-	case 22:
-		batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC12](p, 22, points, scalars, splitFirstChunk)
+}

-	case 23:
-		batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC21](p, 23, points, scalars, splitFirstChunk)
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
+type bucketG1AffineC4 [1 << (4 - 1)]G1Affine
+type bucketG1AffineC5 [1 << (5 - 1)]G1Affine
+type bucketG1AffineC8 [1 << (8 - 1)]G1Affine
+type bucketG1AffineC16 [1 << (16 - 1)]G1Affine

-	default:
-		panic("not implemented")
-	}
+type ibG1Affine interface {
+	bucketG1AffineC4 |
+		bucketG1AffineC5 |
+		bucketG1AffineC8 |
+		bucketG1AffineC16
}

type BatchG1Affine[B ibG1Affine] struct {
@@ -347,10 +251,16 @@ func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]

}

-func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64,
-	chRes chan<- g1JacExtended,
+// processChunkG2BatchAffine processes a chunk of the scalars during the msm
+// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition
+// we use a batch affine addition.
+//
+// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249
+// See Section 5.3: ia.cr/2022/1396
+func processChunkG2BatchAffine[B ibG2Affine](chunk uint64,
+	chRes chan<- g2JacExtended,
	c uint64,
-	points []G1Affine,
+	points []G2Affine,
	scalars []fr.Element) {

	mask := uint64((1 << c) - 1) // low c bits are 1
@@ -372,7 +282,7 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64,
		s.shiftHigh = (c - nbBitsHigh)
	}

-	batch := newBatchG1Affine(&buckets, points)
+	batch := newBatchG2Affine(&buckets, points)
	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
	nbBatches := 0
	for i := 0; i < len(scalars); i++ {
@@ -417,7 +327,7 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64,
	// chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points))
	// batch.ExecuteAndReset()
	for len(queue) != 0 {
-		queue = processQueueG1Affine(queue, &batch)
+		queue = processQueueG2Affine(queue, &batch)
		batch.ExecuteAndReset() // execute batch even if not full.
	}

@@ -427,7 +337,7 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64,
	// reduce buckets into total
	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ...
+ n*bucket[n-1] - var runningSum, total g1JacExtended + var runningSum, total g2JacExtended runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { @@ -441,366 +351,18 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, } -func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g1JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. - msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, int(c), chChunks[:]) -} - -type bucketG1AffineC1 [1 << (1 - 1)]G1Affine -type bucketG1AffineC2 [1 << (2 - 1)]G1Affine -type bucketG1AffineC3 [1 << (3 - 1)]G1Affine -type bucketG1AffineC4 [1 << (4 - 1)]G1Affine -type bucketG1AffineC5 [1 << (5 - 1)]G1Affine -type bucketG1AffineC6 [1 << (6 - 1)]G1Affine -type bucketG1AffineC7 [1 << (7 - 1)]G1Affine -type bucketG1AffineC8 [1 << (8 - 1)]G1Affine -type bucketG1AffineC9 [1 << (9 - 1)]G1Affine -type bucketG1AffineC10 [1 << (10 - 1)]G1Affine -type bucketG1AffineC11 [1 << (11 - 1)]G1Affine -type bucketG1AffineC12 [1 << (12 - 1)]G1Affine -type bucketG1AffineC13 [1 << (13 - 1)]G1Affine -type bucketG1AffineC14 [1 << (14 - 1)]G1Affine -type bucketG1AffineC15 [1 << (15 - 1)]G1Affine -type bucketG1AffineC16 [1 << (16 - 1)]G1Affine -type bucketG1AffineC17 [1 << (17 - 1)]G1Affine -type bucketG1AffineC18 [1 << (18 - 1)]G1Affine -type bucketG1AffineC19 [1 << (19 - 1)]G1Affine -type bucketG1AffineC20 [1 << (20 - 1)]G1Affine -type bucketG1AffineC21 [1 << (21 - 1)]G1Affine -type bucketG1AffineC22 [1 << (22 - 1)]G1Affine -type bucketG1AffineC23 [1 << (23 - 1)]G1Affine -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended -type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended -type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended -type bucketg1JacExtendedC6 
[1 << (6 - 1)]g1JacExtended -type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended -type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended -type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended -type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended -type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended -type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended -type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended -type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended -type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended -type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended -type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended -type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended -type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended -type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended - -type ibG1Affine interface { - bucketG1AffineC1 | - bucketG1AffineC2 | - bucketG1AffineC3 | - bucketG1AffineC4 | - bucketG1AffineC5 | - bucketG1AffineC6 | - bucketG1AffineC7 | - bucketG1AffineC8 | - bucketG1AffineC9 | - bucketG1AffineC10 | - bucketG1AffineC11 | - bucketG1AffineC12 | - bucketG1AffineC13 | - bucketG1AffineC14 | - bucketG1AffineC15 | - bucketG1AffineC16 | - bucketG1AffineC17 | - bucketG1AffineC18 | - bucketG1AffineC19 | - bucketG1AffineC20 | - bucketG1AffineC21 | - bucketG1AffineC22 | - bucketG1AffineC23 -} - -type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC2 | - bucketg1JacExtendedC3 | - bucketg1JacExtendedC4 | - bucketg1JacExtendedC5 | - bucketg1JacExtendedC6 | - bucketg1JacExtendedC7 | - bucketg1JacExtendedC8 | - bucketg1JacExtendedC9 | - bucketg1JacExtendedC10 | - bucketg1JacExtendedC11 | - bucketg1JacExtendedC12 | - bucketg1JacExtendedC13 | - bucketg1JacExtendedC14 | - bucketg1JacExtendedC15 | - bucketg1JacExtendedC16 | - bucketg1JacExtendedC17 | - bucketg1JacExtendedC18 | - bucketg1JacExtendedC19 | - bucketg1JacExtendedC20 | - bucketg1JacExtendedC21 | - bucketg1JacExtendedC22 | - bucketg1JacExtendedC23 -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... 
) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
-	_p := make([]G2Jac, nbSplits-1)
-	chDone := make(chan int, nbSplits-1)
-	for i := 0; i < nbSplits-1; i++ {
-		start := i * nbPoints
-		end := start + nbPoints
-		go func(start, end, i int) {
-			msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk)
-			chDone <- i
-		}(start, end, i)
-	}
-
-	msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk)
-	for i := 0; i < nbSplits-1; i++ {
-		done := <-chDone
-		p.AddAssign(&_p[done])
-	}
-	close(chDone)
-	return p, nil
-}
-
-func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) {
-
-	switch c {
-
-	case 1:
-		msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk)
-
-	case 2:
-		msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk)
-
-	case 3:
-		msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC2](p, 3, points, scalars, splitFirstChunk)
-
-	case 4:
-		msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk)
-
-	case 5:
-		msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC5](p, 5, points, scalars, splitFirstChunk)
-
-	case 6:
-		msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC2](p, 6, points, scalars, splitFirstChunk)
-
-	case 7:
-		msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC5](p, 7, points, scalars, splitFirstChunk)
-
-	case 8:
-		msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk)
-
-	case 9:
-		msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC5](p, 9, points, scalars, splitFirstChunk)
-
-	case 10:
-		batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC10](p, 10, points, scalars, splitFirstChunk)
-
-	case 11:
-		batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC1](p, 11, points, scalars, splitFirstChunk)
-
-	case 12:
-		batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC8](p, 12, points, scalars, splitFirstChunk)
-
-	case 13:
-		batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC8](p, 13, points, scalars, splitFirstChunk)
-
-	case 14:
-		batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC12](p, 14, points, scalars, splitFirstChunk)
-
-	case 15:
-		batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC5](p, 15, points, scalars, splitFirstChunk)
-
-	case 16:
-		batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk)
-
-	case 17:
-		batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC14](p, 17, points, scalars, splitFirstChunk)
-
-	case 18:
-		batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC14](p, 18, points, scalars, splitFirstChunk)
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
+type bucketG2AffineC4 [1 << (4 - 1)]G2Affine
+type bucketG2AffineC5 [1 << (5 - 1)]G2Affine
+type
bucketG2AffineC8 [1 << (8 - 1)]G2Affine +type bucketG2AffineC16 [1 << (16 - 1)]G2Affine - default: - panic("not implemented") - } +type ibG2Affine interface { + bucketG2AffineC4 | + bucketG2AffineC5 | + bucketG2AffineC8 | + bucketG2AffineC16 } type BatchG2Affine[B ibG2Affine] struct { @@ -909,253 +471,3 @@ func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B] return queue } - -func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, - chRes chan<- g2JacExtended, - c uint64, - points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - batch := newBatchG2Affine(&buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - op.bucketID = uint32(bits - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) - } else { - // sub - op.bucketID = (uint32(bits & ^msbWindow)) - op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) - } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() - nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - } - } else { - // put it in queue. - queue = append(queue, op) - } - } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() - for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. - } - - // flush items in batch. - batch.ExecuteAndReset() - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] - - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { - runningSum.addMixed(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G2Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g2JacExtended, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. - msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) -} - -type bucketG2AffineC1 [1 << (1 - 1)]G2Affine -type bucketG2AffineC2 [1 << (2 - 1)]G2Affine -type bucketG2AffineC3 [1 << (3 - 1)]G2Affine -type bucketG2AffineC4 [1 << (4 - 1)]G2Affine -type bucketG2AffineC5 [1 << (5 - 1)]G2Affine -type bucketG2AffineC6 [1 << (6 - 1)]G2Affine -type bucketG2AffineC7 [1 << (7 - 1)]G2Affine -type bucketG2AffineC8 [1 << (8 - 1)]G2Affine -type bucketG2AffineC9 [1 << (9 - 1)]G2Affine -type bucketG2AffineC10 [1 << (10 - 1)]G2Affine -type bucketG2AffineC11 [1 << (11 - 1)]G2Affine -type bucketG2AffineC12 [1 << (12 - 1)]G2Affine -type bucketG2AffineC13 [1 << (13 - 1)]G2Affine -type bucketG2AffineC14 [1 << (14 - 1)]G2Affine -type bucketG2AffineC15 [1 << (15 - 1)]G2Affine -type bucketG2AffineC16 [1 << (16 - 1)]G2Affine -type bucketG2AffineC17 [1 << (17 - 1)]G2Affine -type bucketG2AffineC18 [1 << (18 - 1)]G2Affine -type bucketG2AffineC19 [1 << (19 - 1)]G2Affine -type bucketG2AffineC20 [1 << (20 - 1)]G2Affine -type bucketG2AffineC21 [1 << (21 - 1)]G2Affine -type bucketG2AffineC22 [1 << (22 - 1)]G2Affine -type bucketG2AffineC23 [1 << (23 - 1)]G2Affine -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended -type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended -type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended -type 
bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended -type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended -type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended -type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended -type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended -type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended -type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended -type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended -type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended -type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended -type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended -type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended -type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended -type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended -type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended - -type ibG2Affine interface { - bucketG2AffineC1 | - bucketG2AffineC2 | - bucketG2AffineC3 | - bucketG2AffineC4 | - bucketG2AffineC5 | - bucketG2AffineC6 | - bucketG2AffineC7 | - bucketG2AffineC8 | - bucketG2AffineC9 | - bucketG2AffineC10 | - bucketG2AffineC11 | - bucketG2AffineC12 | - bucketG2AffineC13 | - bucketG2AffineC14 | - bucketG2AffineC15 | - bucketG2AffineC16 | - bucketG2AffineC17 | - bucketG2AffineC18 | - bucketG2AffineC19 | - bucketG2AffineC20 | - bucketG2AffineC21 | - bucketG2AffineC22 | - bucketG2AffineC23 -} - -type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC2 | - bucketg2JacExtendedC3 | - bucketg2JacExtendedC4 | - bucketg2JacExtendedC5 | - bucketg2JacExtendedC6 | - bucketg2JacExtendedC7 | - bucketg2JacExtendedC8 | - bucketg2JacExtendedC9 | - bucketg2JacExtendedC10 | - bucketg2JacExtendedC11 | - bucketg2JacExtendedC12 | - bucketg2JacExtendedC13 | - bucketg2JacExtendedC14 | - bucketg2JacExtendedC15 | - bucketg2JacExtendedC16 | - bucketg2JacExtendedC17 | - bucketg2JacExtendedC18 | - bucketg2JacExtendedC19 | - bucketg2JacExtendedC20 | - bucketg2JacExtendedC21 | - bucketg2JacExtendedC22 | - bucketg2JacExtendedC23 -} diff --git a/ecc/bw6-633/multiexp_jacobian.go b/ecc/bw6-633/multiexp_jacobian.go new file mode 100644 index 0000000000..f331d07491 --- /dev/null +++ b/ecc/bw6-633/multiexp_jacobian.go @@ -0,0 +1,177 @@ +// Copyright 2020 ConsenSys Software Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
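An aside on the reduction used throughout the processChunk functions in the generated file below: the buckets are folded with a single running sum, so that total = bucket[0] + 2*bucket[1] + ... + n*bucket[n-1] costs only two group additions per bucket. A minimal, self-contained sketch of the same scheme over plain integers (reduceBucketsSketch is an illustrative name, not part of the generated code):

package main

import "fmt"

// reduceBucketsSketch folds buckets from the last one down: after the step
// for index k, runningSum holds bucket[k] + bucket[k+1] + ... + bucket[n-1],
// so adding it into total counts bucket[k] exactly (k+1) times.
func reduceBucketsSketch(buckets []int) int {
	runningSum, total := 0, 0
	for k := len(buckets) - 1; k >= 0; k-- {
		runningSum += buckets[k]
		total += runningSum
	}
	return total
}

func main() {
	fmt.Println(reduceBucketsSketch([]int{3, 1, 4})) // 1*3 + 2*1 + 3*4 = 17
}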
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package bw6633
+
+import (
+	"github.com/consensys/gnark-crypto/ecc/bw6-633/fr"
+)
+
+func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64,
+	chRes chan<- g1JacExtended,
+	c uint64,
+	points []G1Affine,
+	scalars []fr.Element) {
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))
+
+	var buckets B
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
+	}
+
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
+	}
+
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].addMixed(&points[i])
+		} else {
+			// sub
+			buckets[bits & ^msbWindow].subMixed(&points[i])
+		}
+	}
+
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
+
+	var runningSum, total g1JacExtended
+	runningSum.setInfinity()
+	total.setInfinity()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].ZZ.IsZero() {
+			runningSum.add(&buckets[k])
+		}
+		total.add(&runningSum)
+	}
+
+	chRes <- total
+
+}
+
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
+type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended
+type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended
+type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended
+type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended
+
+type ibg1JacExtended interface {
+	bucketg1JacExtendedC4 |
+		bucketg1JacExtendedC5 |
+		bucketg1JacExtendedC8 |
+		bucketg1JacExtendedC16
+}
+
+func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64,
+	chRes chan<- g2JacExtended,
+	c uint64,
+	points []G2Affine,
+	scalars []fr.Element) {
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))
+
+	var buckets B
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
+	}
+
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
+	}
+
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].addMixed(&points[i])
+		} else {
+			// sub
+			buckets[bits & ^msbWindow].subMixed(&points[i])
+		}
+	}
+
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ...
+ n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].ZZ.IsZero() { + runningSum.add(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack +type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended +type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended +type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended + +type ibg2JacExtended interface { + bucketg2JacExtendedC4 | + bucketg2JacExtendedC5 | + bucketg2JacExtendedC8 | + bucketg2JacExtendedC16 +} diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 32d3f4e986..0a8268cd6a 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) + innerMsmG1(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} + cRange := []uint64{4, 5, 8, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -130,10 +130,10 @@ func TestMultiExpG1(t *testing.T) { results := make([]G1Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&results[i], int(c), samplePoints[:], scalars, false) + innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG1Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -171,10 +171,10 @@ func TestMultiExpG1(t *testing.T) { results := make([]G1Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG1Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -209,8 +209,8 @@ func TestMultiExpG1(t *testing.T) { var result1, result2 G1Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) if !result1.Equal(&result2) { return false } @@ -288,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { 
b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -461,10 +461,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePoints[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -502,10 +502,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -540,8 +540,8 @@ func TestMultiExpG2(t *testing.T) { var result1, result2 G2Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result2, int(c), samplePoints[:], scalars, false) if !result1.Equal(&result2) { return false } @@ -619,7 +619,7 @@ func BenchmarkMultiExpG2(b *testing.B) { b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index 2ab93e7fd9..ee5ff35a9a 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -25,143 +25,6 @@ import ( "runtime" ) -// selector stores the index, mask and shifts needed to select bits from a scalar -// it is used during the multiExp algorithm or the batch scalar multiplication -type selector struct { - index uint64 // index in the multi-word scalar to select bits from - mask uint64 // mask (c-bit wide) - shift uint64 // shift needed to get our bits on low positions - - multiWordSelect bool // set to true if we need to select bits from 2 words (case where c doesn't divide 64) - maskHigh uint64 // same than mask, for index+1 - shiftHigh uint64 // 
same than shift, for index+1 -} - -// partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits -// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract -// 2^{c} to the current digit, making it negative. -// negative digits can be processed in a later step as adding -G into the bucket instead of G -// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) -// scalarsMont indicates wheter the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { - toReturn := make([]fr.Element, len(scalars)) - - // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words - - // compute offset and word selector / shift to select the right bits of our windows - selectors := make([]selector, nbChunks) - for chunk := uint64(0); chunk < nbChunks; chunk++ { - jc := uint64(chunk * c) - d := selector{} - d.index = jc / 64 - d.shift = jc - (d.index * 64) - d.mask = mask << d.shift - d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1) - if d.multiWordSelect { - nbBitsHigh := d.shift - uint64(64-c) - d.maskHigh = (1 << nbBitsHigh) - 1 - d.shiftHigh = (c - nbBitsHigh) - } - selectors[chunk] = d - } - - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK. 
- chSmallValues := make(chan int, nbTasks) - - parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 - for i := start; i < end; i++ { - var carry int - - scalar := scalars[i] - if scalarsMont { - scalar.FromMont() - } - if scalar.FitsOnOneWord() { - // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } - } - - // for each chunk in the scalar, compute the current digit, and an eventual carry - for chunk := uint64(0); chunk < nbChunks; chunk++ { - s := selectors[chunk] - - // init with carry if any - digit := carry - carry = 0 - - // digit = value of the c-bit window - digit += int((scalar[s.index] & s.mask) >> s.shift) - - if s.multiWordSelect { - // we are selecting bits over 2 words - digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh - } - - // if digit is zero, no impact on result - if digit == 0 { - continue - } - - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - if digit >= max { - digit -= (1 << c) - carry = 1 - } - - var bits uint64 - if digit >= 0 { - bits = uint64(digit) - } else { - bits = uint64(-digit-1) | msbWindow - } - - toReturn[i][s.index] |= (bits << s.shift) - if s.multiWordSelect { - toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - } - - } - } - - chSmallValues <- smallValues - - }, nbTasks) - - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o - } - return toReturn, smallValues -} - // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf // // This call return an error if len(scalars) != len(points) or if provided config is invalid. @@ -221,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} + implementedCs := []uint64{4, 5, 8, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -266,7 +129,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1Jac , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 // we have nbSplits intermediate results that we must sum together. 
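For intuition, the signed-digit recoding performed by partitionScalars can be reproduced on a plain uint64. The sketch below is illustrative only: the real code operates on multi-limb fr.Element values and packs the recoded digits back into the scalar in place, and the helper name signedDigits is not part of this patch.

package main

import "fmt"

// signedDigits decomposes a small scalar into signed c-bit digits, mirroring
// the borrow/carry idea of partitionScalars (illustrative only).
func signedDigits(scalar uint64, c uint) []int {
	mask := uint64(1<<c) - 1
	max := int(1 << (c - 1))
	var digits []int
	carry := 0
	for scalar != 0 || carry != 0 {
		d := int(scalar&mask) + carry
		scalar >>= c
		carry = 0
		if d >= max {
			d -= 1 << c // borrow 2^c from the next window
			carry = 1
		}
		digits = append(digits, d)
	}
	return digits
}

func main() {
	// 0xDE = 1101_1110: with c=4 the windows are [14, 13]; 14 >= 8 so it
	// becomes -2 with a carry, giving digits [-2, -2, 1], and indeed
	// -2 + (-2)<<4 + 1<<8 == 0xDE.
	fmt.Println(signedDigits(0xDE, 4))
}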
@@ -276,12 +139,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG1Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG1Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -290,169 +153,35 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG1Jac(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC3](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC4](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC6](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC6](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC6](p, 9, points, scalars, splitFirstChunk) - - case 10: - msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC4](p, 10, points, scalars, splitFirstChunk) - - case 11: - msmCG1Affine[bucketg1JacExtendedC11, bucketg1JacExtendedC10](p, 11, points, scalars, splitFirstChunk) - - case 12: - msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC12](p, 12, points, scalars, splitFirstChunk) - - case 13: - msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC7](p, 13, points, scalars, splitFirstChunk) - - case 14: - msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC6](p, 14, points, scalars, splitFirstChunk) - - case 15: - msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC9](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] + _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 16: - msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC10](p, 17, points, scalars, splitFirstChunk) - - case 18: - msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC6](p, 18, points, scalars, 
splitFirstChunk) - - case 19: - msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC4](p, 19, points, scalars, splitFirstChunk) - - case 20: - msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC4](p, 20, points, scalars, splitFirstChunk) - - case 21: - msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC6](p, 21, points, scalars, splitFirstChunk) - - case 22: - msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC10](p, 22, points, scalars, splitFirstChunk) - - case 23: - msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC16](p, 23, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") } } -// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -func msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64, - chRes chan<- g1JacExtended, - c uint64, - points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) - } - } - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1]
-
-	var runningSum, total g1JacExtended
-	runningSum.setInfinity()
-	total.setInfinity()
-	for k := len(buckets) - 1; k >= 0; k-- {
-		if !buckets[k].ZZ.IsZero() {
-			runningSum.add(&buckets[k])
-		}
-		total.add(&runningSum)
-	}
-
-	chRes <- total
-
-}
-
-func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
 	nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
 	if (fr.Limbs*64)%c != 0 {
 		nbChunks++
 	}
+
 	// for each chunk, spawn one go routine that'll loop through all the scalars in the
 	// corresponding bit-window
 	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
@@ -464,45 +193,54 @@ func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, poi
 		chChunks[i] = make(chan g1JacExtended, 1)
 	}
 
-	if (fr.Limbs*64)%c != 0 {
-		// TODO @gbotrel not always needed to do ext jac here.
-		go func(j uint64, points []G1Affine, scalars []fr.Element) {
-			// var buckets LB
-			// lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-			// buckets := make([]g1JacExtended, 1<<(lastC-1))
-			// TODO @gbotrel last C restore.
-			msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars)
-		}(uint64(nbChunks-1), points, scalars)
-		nbChunks--
-	}
+	// the last chunk may be processed with a different method than the rest, as it could be smaller.
+	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars)
 
-	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars)
+	for j := int(nbChunks - 2); j > 0; j-- {
+		go processChunk(uint64(j), chChunks[j], c, points, scalars)
 	}
 
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
-	}
+	// the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1]
+	// --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunks are processed
+	// in the ~same amount of time
+	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
+ if !splitFirstChunk { + go processChunk(0, chChunks[0], c, points, scalars) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, chSplit, c, points[:split], scalars[:split]) + go processChunk(0, chSplit, c, points[split:], scalars[split:]) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } +// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { + var _p g1JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf // // This call return an error if len(scalars) != len(points) or if provided config is invalid. @@ -562,7 +300,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} + implementedCs := []uint64{4, 5, 8, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -607,7 +345,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 // we have nbSplits intermediate results that we must sum together. 
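The bucket reduction used by every processChunk variant, total = bucket[0] + 2*bucket[1] + ... + n*bucket[n-1], needs no scalar multiplications: scanning from the top, a suffix running sum counts bucket[k] exactly k+1 times. A minimal integer analogue, with ints standing in for g1JacExtended points (illustrative, not library code):

package main

import "fmt"

func main() {
	// buckets[k] conceptually holds the sum of all points whose digit is k+1.
	buckets := []int{5, 7, 11}
	runningSum, total := 0, 0
	for k := len(buckets) - 1; k >= 0; k-- {
		runningSum += buckets[k] // suffix sum: buckets[k] enters here once...
		total += runningSum      // ...and is then re-added on every lower k
	}
	fmt.Println(total, 1*5+2*7+3*11) // both print 52
}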
@@ -617,12 +355,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -631,82 +369,76 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC3](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC4](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC6](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC6](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 8: - msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC6](p, 9, points, scalars, splitFirstChunk) - - case 10: - msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC4](p, 10, points, scalars, splitFirstChunk) - - case 11: - msmCG2Affine[bucketg2JacExtendedC11, bucketg2JacExtendedC10](p, 11, points, scalars, splitFirstChunk) - - case 12: - msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC12](p, 12, points, scalars, splitFirstChunk) - - case 13: - msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC7](p, 13, points, scalars, splitFirstChunk) - - case 14: - msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC6](p, 14, points, scalars, splitFirstChunk) - - case 15: - msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC9](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] + _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 16: - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + default: + panic("not implemented") + } +} - case 17: - 
msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC10](p, 17, points, scalars, splitFirstChunk)
+func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool,
+	processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac {
 
-	case 18:
-		msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC6](p, 18, points, scalars, splitFirstChunk)
+	nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
 
-	case 19:
-		msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC4](p, 19, points, scalars, splitFirstChunk)
+	// for each chunk, spawn one go routine that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
 
-	case 20:
-		msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC4](p, 20, points, scalars, splitFirstChunk)
+	// each go routine sends its result in chChunks[i] channel
+	chChunks := make([]chan g2JacExtended, nbChunks)
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan g2JacExtended, 1)
+	}
 
-	case 21:
-		msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC6](p, 21, points, scalars, splitFirstChunk)
+	// the last chunk may be processed with a different method than the rest, as it could be smaller.
+	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars)
 
-	case 22:
-		msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC10](p, 22, points, scalars, splitFirstChunk)
+	for j := int(nbChunks - 2); j > 0; j-- {
+		go processChunk(uint64(j), chChunks[j], c, points, scalars)
+	}
 
-	case 23:
-		msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC16](p, 23, points, scalars, splitFirstChunk)
+	// the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1]
+	// --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunks are processed
+	// in the ~same amount of time
+	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
+		if !splitFirstChunk {
+			go processChunk(0, chChunks[0], c, points, scalars)
+		} else {
+			chSplit := make(chan g2JacExtended, 2)
+			split := len(points) / 2
+			go processChunk(0, chSplit, c, points[:split], scalars[:split])
+			go processChunk(0, chSplit, c, points[split:], scalars[split:])
+			go func() {
+				s1 := <-chSplit
+				s2 := <-chSplit
+				close(chSplit)
+				s1.add(&s2)
+				chChunks[0] <- s1
+			}()
+		}
 
-	default:
-		panic("not implemented")
 	}
+
+	return msmReduceChunkG2Affine(p, int(c), chChunks[:])
 }
 
 // msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp
@@ -725,121 +457,139 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J
 	return p.unsafeFromJacExtended(&_p)
 }
 
-func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64,
-	chRes chan<- g2JacExtended,
-	c uint64,
-	points []G2Affine,
-	scalars []fr.Element) {
+// selector stores the index, mask and shifts needed to select bits from a scalar
+// it is used during the multiExp algorithm or the batch scalar multiplication
+type selector struct {
+	index uint64 // index in the multi-word scalar to select bits from
+	mask  uint64 // mask (c-bit wide)
+	shift uint64 // shift needed to get our bits on low positions
 
-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
+	multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
+	maskHigh        uint64 // same as mask, for index+1
+	shiftHigh       uint64 // same as shift, for index+1
+}
 
-	var buckets B
-	for i := 0; i < len(buckets); i++ {
-		buckets[i].setInfinity()
-	}
+// partitionScalars computes, for each scalar over c-bit wide windows, nbChunk digits
+// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
+// scalarsMont indicates whether the provided scalars are in Montgomery form
+// returns smallValues, which represents the number of scalars that meet the following condition:
+// 0 < scalar < 2^c (in other words, scalars where only the c least significant bits are non-zero)
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+	toReturn := make([]fr.Element, len(scalars))
 
-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
 	}
 
-	// for each scalars, get the digit corresponding to the chunk we're processing.
- for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window + max := int(1 << (c - 1)) // max value we want for our digits + cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + // compute offset and word selector / shift to select the right bits of our windows + selectors := make([]selector, nbChunks) + for chunk := uint64(0); chunk < nbChunks; chunk++ { + jc := uint64(chunk * c) + d := selector{} + d.index = jc / 64 + d.shift = jc - (d.index * 64) + d.mask = mask << d.shift + d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1) + if d.multiWordSelect { + nbBitsHigh := d.shift - uint64(64-c) + d.maskHigh = (1 << nbBitsHigh) - 1 + d.shiftHigh = (c - nbBitsHigh) } + selectors[chunk] = d } - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + // for each chunk, we could track the number of non-zeros points we will need to process + // this way, if a chunk has more work to do than others, we can spawn off more go routines + // (at the cost of more buckets allocated) + // a simplified approach is to track the small values where only the first word is set + // if this number represent a significant number of points, then we will split first chunk + // processing in the msm in 2, to ensure all go routines finish at ~same time + // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine + // if it does, though, this will deadlocK. 
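+	// (for instance: if 200k of 1M scalars fit in the low c bits, the ratio is
+	// 0.2 >= 0.1, so the caller sets splitFirstChunk and chunk 0 is processed
+	// by two go routines over half the points each)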
+ chSmallValues := make(chan int, nbTasks) - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } + parallel.Execute(len(scalars), func(start, end int) { + smallValues := 0 + for i := start; i < end; i++ { + var carry int - chRes <- total + scalar := scalars[i] + if scalarsMont { + scalar.FromMont() + } + if scalar.FitsOnOneWord() { + // everything is 0, no need to process this scalar + if scalar[0] == 0 { + continue + } + // low c-bits are 1 in mask + if scalar[0]&mask == scalar[0] { + smallValues++ + } + } -} + // for each chunk in the scalar, compute the current digit, and an eventual carry + for chunk := uint64(0); chunk < nbChunks; chunk++ { + s := selectors[chunk] -func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // init with carry if any + digit := carry + carry = 0 - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G2Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g2JacExtended, 1<<(lastC-1)) - // TODO @gbotrel last C restore. - msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars) - } + // if digit is zero, no impact on result + if digit == 0 { + continue + } - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract + // 2^{c} to the current digit, making it negative. 
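+				// (worked example: with c = 4, a window value of 13 becomes the
+				// digit 13 - 16 = -3 with a carry of 1 into the next window; the
+				// negative digit is then encoded below as (-digit-1) | msbWindow,
+				// here 2 | msbWindow)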
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
 
-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g2JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
-	}
+				var bits uint64
+				if digit >= 0 {
+					bits = uint64(digit)
+				} else {
+					bits = uint64(-digit-1) | msbWindow
+				}
 
-	return msmReduceChunkG2Affine(p, int(c), chChunks[:])
+				toReturn[i][s.index] |= (bits << s.shift)
+				if s.multiWordSelect {
+					toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				}
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
 }
diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go
index 3b533e9059..537221cb69 100644
--- a/ecc/bw6-756/multiexp_affine.go
+++ b/ecc/bw6-756/multiexp_affine.go
@@ -17,11 +17,7 @@ package bw6756
 
 import (
-	"errors"
-	"github.com/consensys/gnark-crypto/ecc"
 	"github.com/consensys/gnark-crypto/ecc/bw6-756/fr"
-	"math"
-	"runtime"
 )
 
 const MAX_BATCH_SIZE = 600
 
@@ -34,210 +30,118 @@ func (o batchOp) isNeg() bool {
 	return o.pointID&1 == 1
 }
 
-// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+// processChunkG1BatchAffine processes a chunk of the scalars during the msm
+// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition
+// we use a batch affine addition.
 //
-// This call return an error if len(scalars) != len(points) or if provided config is invalid.
-func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) {
-	var _p G1Jac
-	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
-		return nil, err
-	}
-	p.FromJacobian(&_p)
-	return p, nil
-}
-
-// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
-//
-// This call return an error if len(scalars) != len(points) or if provided config is invalid.
-func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) {
-	// note:
-	// each of the batchAffineMsmCX method is the same, except for the c constant it declares
-	// duplicating (through template generation) these methods allows to declare the buckets on the stack
-	// the choice of c needs to be improved:
-	// there is a theoritical value that gives optimal asymptotics
-	// but in practice, other factors come into play, including:
-	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
-	// * number of CPUs
-	// * cache friendliness (which depends on the host, G1 or G2... )
-	// --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't.
-
-	// for each batchAffineMsmCX
-	// step 1
-	// we compute, for each scalars over c-bit wide windows, nbChunk digits
-	// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract
-	// 2^{c} to the current digit, making it negative.
- // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, + chRes chan<- g1JacExtended, + c uint64, + points []G1Affine, + scalars []fr.Element) { - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() } - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) } - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } + batch := newBatchG1Affine(&buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh } - // empirical, needs to be tuned. 
- // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 + if bits == 0 { + continue } - } - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. - _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } } - - msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG1Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. 
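+		// (note: each pass re-attempts the deferred ops and keeps only those whose
+		// bucket is still pending; because the batch executes even when not full,
+		// every pass frees all buckets, so this drain loop terminates)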
} - close(chDone) - return p, nil -} - -func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC3](p, 3, points, scalars, splitFirstChunk) - - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - - case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC4](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC6](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC6](p, 7, points, scalars, splitFirstChunk) - - case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC6](p, 9, points, scalars, splitFirstChunk) - - case 10: - batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC4](p, 10, points, scalars, splitFirstChunk) - - case 11: - batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC10](p, 11, points, scalars, splitFirstChunk) - - case 12: - batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC12](p, 12, points, scalars, splitFirstChunk) - - case 13: - batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC7](p, 13, points, scalars, splitFirstChunk) - - case 14: - batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC6](p, 14, points, scalars, splitFirstChunk) - - case 15: - batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC9](p, 15, points, scalars, splitFirstChunk) - - case 16: - batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC10](p, 17, points, scalars, splitFirstChunk) - - case 18: - batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC6](p, 18, points, scalars, splitFirstChunk) + // flush items in batch. + batch.ExecuteAndReset() - case 19: - batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC4](p, 19, points, scalars, splitFirstChunk) + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1]
 
-	case 20:
-		batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC4](p, 20, points, scalars, splitFirstChunk)
+	var runningSum, total g1JacExtended
+	runningSum.setInfinity()
+	total.setInfinity()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].IsInfinity() {
+			runningSum.addMixed(&buckets[k])
+		}
+		total.add(&runningSum)
+	}
 
-	case 21:
-		batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC6](p, 21, points, scalars, splitFirstChunk)
+	chRes <- total
 
-	case 22:
-		batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC10](p, 22, points, scalars, splitFirstChunk)
+}
 
-	case 23:
-		batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC16](p, 23, points, scalars, splitFirstChunk)
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
+type bucketG1AffineC4 [1 << (4 - 1)]G1Affine
+type bucketG1AffineC5 [1 << (5 - 1)]G1Affine
+type bucketG1AffineC8 [1 << (8 - 1)]G1Affine
+type bucketG1AffineC16 [1 << (16 - 1)]G1Affine
 
-	default:
-		panic("not implemented")
-	}
+type ibG1Affine interface {
+	bucketG1AffineC4 |
+		bucketG1AffineC5 |
+		bucketG1AffineC8 |
+		bucketG1AffineC16
 }
 
 type BatchG1Affine[B ibG1Affine] struct {
@@ -347,10 +251,16 @@ func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]
 
 }
 
-func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64,
-	chRes chan<- g1JacExtended,
+// processChunkG2BatchAffine processes a chunk of the scalars during the msm
+// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition
+// we use a batch affine addition.
+//
+// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249
+// See Section 5.3: ia.cr/2022/1396
+func processChunkG2BatchAffine[B ibG2Affine](chunk uint64,
+	chRes chan<- g2JacExtended,
 	c uint64,
-	points []G1Affine,
+	points []G2Affine,
 	scalars []fr.Element) {
 
 	mask := uint64((1 << c) - 1) // low c bits are 1
@@ -372,7 +282,7 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64,
 		s.shiftHigh = (c - nbBitsHigh)
 	}
 
-	batch := newBatchG1Affine(&buckets, points)
+	batch := newBatchG2Affine(&buckets, points)
 	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
 	nbBatches := 0
 	for i := 0; i < len(scalars); i++ {
@@ -417,7 +327,7 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64,
 	// chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points))
 	// batch.ExecuteAndReset()
 	for len(queue) != 0 {
-		queue = processQueueG1Affine(queue, &batch)
+		queue = processQueueG2Affine(queue, &batch)
 		batch.ExecuteAndReset() // execute batch even if not full.
 	}
 
@@ -427,7 +337,7 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64,
 	// reduce buckets into total
 	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ...
+ n*bucket[n-1] - var runningSum, total g1JacExtended + var runningSum, total g2JacExtended runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { @@ -441,366 +351,18 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, } -func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g1JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. - msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, int(c), chChunks[:]) -} - -type bucketG1AffineC1 [1 << (1 - 1)]G1Affine -type bucketG1AffineC2 [1 << (2 - 1)]G1Affine -type bucketG1AffineC3 [1 << (3 - 1)]G1Affine -type bucketG1AffineC4 [1 << (4 - 1)]G1Affine -type bucketG1AffineC5 [1 << (5 - 1)]G1Affine -type bucketG1AffineC6 [1 << (6 - 1)]G1Affine -type bucketG1AffineC7 [1 << (7 - 1)]G1Affine -type bucketG1AffineC8 [1 << (8 - 1)]G1Affine -type bucketG1AffineC9 [1 << (9 - 1)]G1Affine -type bucketG1AffineC10 [1 << (10 - 1)]G1Affine -type bucketG1AffineC11 [1 << (11 - 1)]G1Affine -type bucketG1AffineC12 [1 << (12 - 1)]G1Affine -type bucketG1AffineC13 [1 << (13 - 1)]G1Affine -type bucketG1AffineC14 [1 << (14 - 1)]G1Affine -type bucketG1AffineC15 [1 << (15 - 1)]G1Affine -type bucketG1AffineC16 [1 << (16 - 1)]G1Affine -type bucketG1AffineC17 [1 << (17 - 1)]G1Affine -type bucketG1AffineC18 [1 << (18 - 1)]G1Affine -type bucketG1AffineC19 [1 << (19 - 1)]G1Affine -type bucketG1AffineC20 [1 << (20 - 1)]G1Affine -type bucketG1AffineC21 [1 << (21 - 1)]G1Affine -type bucketG1AffineC22 [1 << (22 - 1)]G1Affine -type bucketG1AffineC23 [1 << (23 - 1)]G1Affine -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended -type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended -type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended -type bucketg1JacExtendedC6 
[1 << (6 - 1)]g1JacExtended -type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended -type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended -type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended -type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended -type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended -type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended -type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended -type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended -type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended -type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended -type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended -type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended -type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended -type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended - -type ibG1Affine interface { - bucketG1AffineC1 | - bucketG1AffineC2 | - bucketG1AffineC3 | - bucketG1AffineC4 | - bucketG1AffineC5 | - bucketG1AffineC6 | - bucketG1AffineC7 | - bucketG1AffineC8 | - bucketG1AffineC9 | - bucketG1AffineC10 | - bucketG1AffineC11 | - bucketG1AffineC12 | - bucketG1AffineC13 | - bucketG1AffineC14 | - bucketG1AffineC15 | - bucketG1AffineC16 | - bucketG1AffineC17 | - bucketG1AffineC18 | - bucketG1AffineC19 | - bucketG1AffineC20 | - bucketG1AffineC21 | - bucketG1AffineC22 | - bucketG1AffineC23 -} - -type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC2 | - bucketg1JacExtendedC3 | - bucketg1JacExtendedC4 | - bucketg1JacExtendedC5 | - bucketg1JacExtendedC6 | - bucketg1JacExtendedC7 | - bucketg1JacExtendedC8 | - bucketg1JacExtendedC9 | - bucketg1JacExtendedC10 | - bucketg1JacExtendedC11 | - bucketg1JacExtendedC12 | - bucketg1JacExtendedC13 | - bucketg1JacExtendedC14 | - bucketg1JacExtendedC15 | - bucketg1JacExtendedC16 | - bucketg1JacExtendedC17 | - bucketg1JacExtendedC18 | - bucketg1JacExtendedC19 | - bucketg1JacExtendedC20 | - bucketg1JacExtendedC21 | - bucketg1JacExtendedC22 | - bucketg1JacExtendedC23 -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... 
) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
-	_p := make([]G2Jac, nbSplits-1)
-	chDone := make(chan int, nbSplits-1)
-	for i := 0; i < nbSplits-1; i++ {
-		start := i * nbPoints
-		end := start + nbPoints
-		go func(start, end, i int) {
-			msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk)
-			chDone <- i
-		}(start, end, i)
-	}
-
-	msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk)
-	for i := 0; i < nbSplits-1; i++ {
-		done := <-chDone
-		p.AddAssign(&_p[done])
-	}
-	close(chDone)
-	return p, nil
-}
-
-func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) {
-
-	switch c {
-
-	case 1:
-		msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk)
-
-	case 2:
-		msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk)
-
-	case 3:
-		msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC3](p, 3, points, scalars, splitFirstChunk)
-
-	case 4:
-		msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk)
-
-	case 5:
-		msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC4](p, 5, points, scalars, splitFirstChunk)
-
-	case 6:
-		msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC6](p, 6, points, scalars, splitFirstChunk)
-
-	case 7:
-		msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC6](p, 7, points, scalars, splitFirstChunk)
-
-	case 8:
-		msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk)
-
-	case 9:
-		msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC6](p, 9, points, scalars, splitFirstChunk)
-
-	case 10:
-		batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC4](p, 10, points, scalars, splitFirstChunk)
-
-	case 11:
-		batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC10](p, 11, points, scalars, splitFirstChunk)
-
-	case 12:
-		batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC12](p, 12, points, scalars, splitFirstChunk)
-
-	case 13:
-		batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC7](p, 13, points, scalars, splitFirstChunk)
-
-	case 14:
-		batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC6](p, 14, points, scalars, splitFirstChunk)
-
-	case 15:
-		batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC9](p, 15, points, scalars, splitFirstChunk)
-
-	case 16:
-		batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk)
-
-	case 17:
-		batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC10](p, 17, points, scalars, splitFirstChunk)
-
-	case 18:
-		batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC6](p, 18, points, scalars, splitFirstChunk)
-
-	case 19:
-		batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC4](p, 19, points, scalars, splitFirstChunk)
-
-	case 20:
-		batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC4](p, 20, points, scalars, splitFirstChunk)
-
-	case 21:
-		batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC6](p, 21, points, scalars, splitFirstChunk)
-
-	case 22:
-		batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC10](p, 22, points, scalars, splitFirstChunk)
-
-	case 23:
-		batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC16](p, 23, points, scalars, splitFirstChunk)
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
+type bucketG2AffineC4 [1 << (4 - 1)]G2Affine
+type bucketG2AffineC5 [1 << (5 - 1)]G2Affine
+type bucketG2AffineC8 
[1 << (8 - 1)]G2Affine +type bucketG2AffineC16 [1 << (16 - 1)]G2Affine - default: - panic("not implemented") - } +type ibG2Affine interface { + bucketG2AffineC4 | + bucketG2AffineC5 | + bucketG2AffineC8 | + bucketG2AffineC16 } type BatchG2Affine[B ibG2Affine] struct { @@ -909,253 +471,3 @@ func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B] return queue } - -func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, - chRes chan<- g2JacExtended, - c uint64, - points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - batch := newBatchG2Affine(&buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - op.bucketID = uint32(bits - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) - } else { - // sub - op.bucketID = (uint32(bits & ^msbWindow)) - op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) - } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() - nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - } - } else { - // put it in queue. - queue = append(queue, op) - } - } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() - for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. - } - - // flush items in batch. - batch.ExecuteAndReset() - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] - - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { - runningSum.addMixed(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G2Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g2JacExtended, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. - msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) -} - -type bucketG2AffineC1 [1 << (1 - 1)]G2Affine -type bucketG2AffineC2 [1 << (2 - 1)]G2Affine -type bucketG2AffineC3 [1 << (3 - 1)]G2Affine -type bucketG2AffineC4 [1 << (4 - 1)]G2Affine -type bucketG2AffineC5 [1 << (5 - 1)]G2Affine -type bucketG2AffineC6 [1 << (6 - 1)]G2Affine -type bucketG2AffineC7 [1 << (7 - 1)]G2Affine -type bucketG2AffineC8 [1 << (8 - 1)]G2Affine -type bucketG2AffineC9 [1 << (9 - 1)]G2Affine -type bucketG2AffineC10 [1 << (10 - 1)]G2Affine -type bucketG2AffineC11 [1 << (11 - 1)]G2Affine -type bucketG2AffineC12 [1 << (12 - 1)]G2Affine -type bucketG2AffineC13 [1 << (13 - 1)]G2Affine -type bucketG2AffineC14 [1 << (14 - 1)]G2Affine -type bucketG2AffineC15 [1 << (15 - 1)]G2Affine -type bucketG2AffineC16 [1 << (16 - 1)]G2Affine -type bucketG2AffineC17 [1 << (17 - 1)]G2Affine -type bucketG2AffineC18 [1 << (18 - 1)]G2Affine -type bucketG2AffineC19 [1 << (19 - 1)]G2Affine -type bucketG2AffineC20 [1 << (20 - 1)]G2Affine -type bucketG2AffineC21 [1 << (21 - 1)]G2Affine -type bucketG2AffineC22 [1 << (22 - 1)]G2Affine -type bucketG2AffineC23 [1 << (23 - 1)]G2Affine -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended -type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended -type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended -type 
bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended -type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended -type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended -type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended -type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended -type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended -type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended -type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended -type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended -type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended -type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended -type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended -type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended -type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended -type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended - -type ibG2Affine interface { - bucketG2AffineC1 | - bucketG2AffineC2 | - bucketG2AffineC3 | - bucketG2AffineC4 | - bucketG2AffineC5 | - bucketG2AffineC6 | - bucketG2AffineC7 | - bucketG2AffineC8 | - bucketG2AffineC9 | - bucketG2AffineC10 | - bucketG2AffineC11 | - bucketG2AffineC12 | - bucketG2AffineC13 | - bucketG2AffineC14 | - bucketG2AffineC15 | - bucketG2AffineC16 | - bucketG2AffineC17 | - bucketG2AffineC18 | - bucketG2AffineC19 | - bucketG2AffineC20 | - bucketG2AffineC21 | - bucketG2AffineC22 | - bucketG2AffineC23 -} - -type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC2 | - bucketg2JacExtendedC3 | - bucketg2JacExtendedC4 | - bucketg2JacExtendedC5 | - bucketg2JacExtendedC6 | - bucketg2JacExtendedC7 | - bucketg2JacExtendedC8 | - bucketg2JacExtendedC9 | - bucketg2JacExtendedC10 | - bucketg2JacExtendedC11 | - bucketg2JacExtendedC12 | - bucketg2JacExtendedC13 | - bucketg2JacExtendedC14 | - bucketg2JacExtendedC15 | - bucketg2JacExtendedC16 | - bucketg2JacExtendedC17 | - bucketg2JacExtendedC18 | - bucketg2JacExtendedC19 | - bucketg2JacExtendedC20 | - bucketg2JacExtendedC21 | - bucketg2JacExtendedC22 | - bucketg2JacExtendedC23 -} diff --git a/ecc/bw6-756/multiexp_jacobian.go b/ecc/bw6-756/multiexp_jacobian.go new file mode 100644 index 0000000000..9dc8862130 --- /dev/null +++ b/ecc/bw6-756/multiexp_jacobian.go @@ -0,0 +1,177 @@ +// Copyright 2020 ConsenSys Software Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
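The per-window bucket types and interface unions deleted above (one type per c from 1 to 23) shrink, in the generated files that follow, to the four window sizes the dispatcher still selects: 4, 5, 8 and 16. As a minimal, self-contained sketch of why the pattern is written this way (illustrative names, with int standing in for a curve point, not library code): a type-set union over fixed-size array types gives each generic instantiation a compile-time bucket count, so `var buckets B` is a plain array value that can live on the stack.

type bucketC4 [1 << (4 - 1)]int
type bucketC8 [1 << (8 - 1)]int

type ibBucket interface {
	bucketC4 | bucketC8
}

// accumulate is compiled once per bucket size; buckets is an array value,
// not a slice, so no heap allocation is needed for it.
func accumulate[B ibBucket](bucketIDs []int, values []int) (total int) {
	var buckets B
	for k, id := range bucketIDs {
		buckets[id%len(buckets)] += values[k]
	}
	for _, b := range buckets {
		total += b
	}
	return total
}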
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package bw6756
+
+import (
+	"github.com/consensys/gnark-crypto/ecc/bw6-756/fr"
+)
+
+func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64,
+	chRes chan<- g1JacExtended,
+	c uint64,
+	points []G1Affine,
+	scalars []fr.Element) {
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))
+
+	var buckets B
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
+	}
+
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
+	}
+
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].addMixed(&points[i])
+		} else {
+			// sub
+			buckets[bits & ^msbWindow].subMixed(&points[i])
+		}
+	}
+
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
+
+	var runningSum, total g1JacExtended
+	runningSum.setInfinity()
+	total.setInfinity()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].ZZ.IsZero() {
+			runningSum.add(&buckets[k])
+		}
+		total.add(&runningSum)
+	}
+
+	chRes <- total
+
+}
+
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
+type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended
+type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended
+type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended
+type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended
+
+type ibg1JacExtended interface {
+	bucketg1JacExtendedC4 |
+		bucketg1JacExtendedC5 |
+		bucketg1JacExtendedC8 |
+		bucketg1JacExtendedC16
+}
+
+func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64,
+	chRes chan<- g2JacExtended,
+	c uint64,
+	points []G2Affine,
+	scalars []fr.Element) {
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))
+
+	var buckets B
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
+	}
+
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
+	}
+
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].addMixed(&points[i])
+		} else {
+			// sub
+			buckets[bits & ^msbWindow].subMixed(&points[i])
+		}
+	}
+
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ...
+ n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].ZZ.IsZero() { + runningSum.add(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack +type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended +type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended +type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended + +type ibg2JacExtended interface { + bucketg2JacExtendedC4 | + bucketg2JacExtendedC5 | + bucketg2JacExtendedC8 | + bucketg2JacExtendedC16 +} diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index 6cbf26cdfa..0d0384701c 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) + innerMsmG1(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} + cRange := []uint64{4, 5, 8, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -130,10 +130,10 @@ func TestMultiExpG1(t *testing.T) { results := make([]G1Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&results[i], int(c), samplePoints[:], scalars, false) + innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG1Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -171,10 +171,10 @@ func TestMultiExpG1(t *testing.T) { results := make([]G1Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG1Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -209,8 +209,8 @@ func TestMultiExpG1(t *testing.T) { var result1, result2 G1Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) if !result1.Equal(&result2) { return false } @@ -288,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { 
b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -461,10 +461,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePoints[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -502,10 +502,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -540,8 +540,8 @@ func TestMultiExpG2(t *testing.T) { var result1, result2 G2Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result2, int(c), samplePoints[:], scalars, false) if !result1.Equal(&result2) { return false } @@ -619,7 +619,7 @@ func BenchmarkMultiExpG2(b *testing.B) { b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index cfaef03004..d9da35c23d 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -25,143 +25,6 @@ import ( "runtime" ) -// selector stores the index, mask and shifts needed to select bits from a scalar -// it is used during the multiExp algorithm or the batch scalar multiplication -type selector struct { - index uint64 // index in the multi-word scalar to select bits from - mask uint64 // mask (c-bit wide) - shift uint64 // shift needed to get our bits on low positions - - multiWordSelect bool // set to true if we need to select bits from 2 words (case where c doesn't divide 64) - maskHigh uint64 // same than mask, for index+1 - shiftHigh uint64 // 
same than shift, for index+1 -} - -// partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits -// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract -// 2^{c} to the current digit, making it negative. -// negative digits can be processed in a later step as adding -G into the bucket instead of G -// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) -// scalarsMont indicates wheter the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { - toReturn := make([]fr.Element, len(scalars)) - - // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words - - // compute offset and word selector / shift to select the right bits of our windows - selectors := make([]selector, nbChunks) - for chunk := uint64(0); chunk < nbChunks; chunk++ { - jc := uint64(chunk * c) - d := selector{} - d.index = jc / 64 - d.shift = jc - (d.index * 64) - d.mask = mask << d.shift - d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1) - if d.multiWordSelect { - nbBitsHigh := d.shift - uint64(64-c) - d.maskHigh = (1 << nbBitsHigh) - 1 - d.shiftHigh = (c - nbBitsHigh) - } - selectors[chunk] = d - } - - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK. 
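Reading aid for the selector fields above: a hedged, standalone sketch of the digit extraction they encode, over a 4-limb little-endian scalar. windowDigit is a hypothetical helper, not library API; it mirrors the index/shift/mask path and the maskHigh/shiftHigh path taken when a window straddles two 64-bit words (only possible when c does not divide 64).

// windowDigit returns the chunk-th c-bit window of a 4-limb scalar, for c < 64.
func windowDigit(limbs [4]uint64, c, chunk uint64) uint64 {
	j := chunk * c
	index := j / 64
	shift := j - index*64
	bits := (limbs[index] >> shift) & (uint64(1)<<c - 1)
	// when c does not divide 64, a window may straddle two words:
	// fetch the missing high bits from the next limb.
	if (64%c) != 0 && shift > 64-c && index < 3 {
		nbBitsHigh := shift - (64 - c)
		bits += (limbs[index+1] & (uint64(1)<<nbBitsHigh - 1)) << (c - nbBitsHigh)
	}
	return bits
}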
- chSmallValues := make(chan int, nbTasks) - - parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 - for i := start; i < end; i++ { - var carry int - - scalar := scalars[i] - if scalarsMont { - scalar.FromMont() - } - if scalar.FitsOnOneWord() { - // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } - } - - // for each chunk in the scalar, compute the current digit, and an eventual carry - for chunk := uint64(0); chunk < nbChunks; chunk++ { - s := selectors[chunk] - - // init with carry if any - digit := carry - carry = 0 - - // digit = value of the c-bit window - digit += int((scalar[s.index] & s.mask) >> s.shift) - - if s.multiWordSelect { - // we are selecting bits over 2 words - digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh - } - - // if digit is zero, no impact on result - if digit == 0 { - continue - } - - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - if digit >= max { - digit -= (1 << c) - carry = 1 - } - - var bits uint64 - if digit >= 0 { - bits = uint64(digit) - } else { - bits = uint64(-digit-1) | msbWindow - } - - toReturn[i][s.index] |= (bits << s.shift) - if s.multiWordSelect { - toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - } - - } - } - - chSmallValues <- smallValues - - }, nbTasks) - - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o - } - return toReturn, smallValues -} - // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf // // This call return an error if len(scalars) != len(points) or if provided config is invalid. @@ -221,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} + implementedCs := []uint64{4, 5, 8, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -266,7 +129,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1Jac , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 // we have nbSplits intermediate results that we must sum together. 
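The surrounding hunks rewire a simple fan-out/fan-in: the point set is cut into nbSplits slices, each partial MSM runs in its own goroutine, and the partial results are summed as they complete. A sketch of the shape with integers standing in for G1 results (process plays the role of innerMsmG1; the real code also reuses p for the last split rather than spawning a goroutine for it):

func fanIn(data []int, nbSplits int, process func([]int) int) int {
	per := len(data) / nbSplits
	chDone := make(chan int, nbSplits)
	for i := 0; i < nbSplits; i++ {
		start, end := i*per, (i+1)*per
		if i == nbSplits-1 {
			end = len(data) // the last split absorbs the remainder
		}
		go func(part []int) { chDone <- process(part) }(data[start:end])
	}
	total := 0
	for i := 0; i < nbSplits; i++ {
		total += <-chDone // completion order is irrelevant: addition commutes
	}
	return total
}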
@@ -276,12 +139,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG1Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG1Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -290,169 +153,35 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG1Jac(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC3](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC4](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC6](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC6](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC6](p, 9, points, scalars, splitFirstChunk) - - case 10: - msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC4](p, 10, points, scalars, splitFirstChunk) - - case 11: - msmCG1Affine[bucketg1JacExtendedC11, bucketg1JacExtendedC10](p, 11, points, scalars, splitFirstChunk) - - case 12: - msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC12](p, 12, points, scalars, splitFirstChunk) - - case 13: - msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC7](p, 13, points, scalars, splitFirstChunk) - - case 14: - msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC6](p, 14, points, scalars, splitFirstChunk) - - case 15: - msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC9](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] + _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 16: - msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC10](p, 17, points, scalars, splitFirstChunk) - - case 18: - msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC6](p, 18, points, scalars, 
splitFirstChunk) - - case 19: - msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC4](p, 19, points, scalars, splitFirstChunk) - - case 20: - msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC4](p, 20, points, scalars, splitFirstChunk) - - case 21: - msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC6](p, 21, points, scalars, splitFirstChunk) - - case 22: - msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC10](p, 22, points, scalars, splitFirstChunk) - - case 23: - msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC16](p, 23, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") } } -// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -func msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64, - chRes chan<- g1JacExtended, - c uint64, - points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) - } - } - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] - - var runningSum, total g1JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { nbChunks++ } + // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is @@ -464,45 +193,54 @@ func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, poi chChunks[i] = make(chan g1JacExtended, 1) } - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel last C restore. - msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } + // the last chunk may be processed with a different method than the rest, as it could be smaller. + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars) + for j := int(nbChunks - 2); j > 0; j-- { + go processChunk(uint64(j), chChunks[j], c, points, scalars) } - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] + // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed + // in the ~same amount of time + if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. 
+ if !splitFirstChunk { + go processChunk(0, chChunks[0], c, points, scalars) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, chSplit, c, points[:split], scalars[:split]) + go processChunk(0, chSplit, c, points[split:], scalars[split:]) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } +// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { + var _p g1JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf // // This call return an error if len(scalars) != len(points) or if provided config is invalid. @@ -562,7 +300,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} + implementedCs := []uint64{4, 5, 8, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -607,7 +345,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 // we have nbSplits intermediate results that we must sum together. 
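Two reductions cooperate in the code above. Within a chunk, the running-sum pass turns the buckets into the weighted sum bucket[0] + 2*bucket[1] + ... + n*bucket[n-1] using only two group additions per bucket; across chunks, msmReduceChunkG1Affine folds the per-chunk sums MSB-first with c doublings between steps. An integer analogue of both, as a sketch rather than library code:

func reduceBuckets(buckets []uint64) uint64 {
	var runningSum, total uint64
	for k := len(buckets) - 1; k >= 0; k-- {
		runningSum += buckets[k] // runningSum = buckets[k] + ... + buckets[n-1]
		total += runningSum
	}
	return total // = 1*buckets[0] + 2*buckets[1] + ... + n*buckets[n-1]
}

func reduceChunks(chunkSums []uint64, c int) uint64 {
	acc := chunkSums[len(chunkSums)-1]
	for j := len(chunkSums) - 2; j >= 0; j-- {
		for l := 0; l < c; l++ {
			acc += acc // plays the role of _p.double(&_p)
		}
		acc += chunkSums[j]
	}
	return acc // = sum over j of chunkSums[j] * 2^(c*j)
}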
@@ -617,12 +355,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -631,82 +369,76 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC3](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC4](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC6](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC6](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 8: - msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC6](p, 9, points, scalars, splitFirstChunk) - - case 10: - msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC4](p, 10, points, scalars, splitFirstChunk) - - case 11: - msmCG2Affine[bucketg2JacExtendedC11, bucketg2JacExtendedC10](p, 11, points, scalars, splitFirstChunk) - - case 12: - msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC12](p, 12, points, scalars, splitFirstChunk) - - case 13: - msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC7](p, 13, points, scalars, splitFirstChunk) - - case 14: - msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC6](p, 14, points, scalars, splitFirstChunk) - - case 15: - msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC9](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] + _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 16: - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + default: + panic("not implemented") + } +} - case 17: - 
msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC10](p, 17, points, scalars, splitFirstChunk) +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { - case 18: - msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC6](p, 18, points, scalars, splitFirstChunk) + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } - case 19: - msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC4](p, 19, points, scalars, splitFirstChunk) + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance - case 20: - msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC4](p, 20, points, scalars, splitFirstChunk) + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } - case 21: - msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC6](p, 21, points, scalars, splitFirstChunk) + // the last chunk may be processed with a different method than the rest, as it could be smaller. + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) - case 22: - msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC10](p, 22, points, scalars, splitFirstChunk) + for j := int(nbChunks - 2); j > 0; j-- { + go processChunk(uint64(j), chChunks[j], c, points, scalars) + } - case 23: - msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC16](p, 23, points, scalars, splitFirstChunk) + // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] + // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed + // in the ~same amount of time + if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. 
+		if !splitFirstChunk {
+			go processChunk(0, chChunks[0], c, points, scalars)
+		} else {
+			chSplit := make(chan g2JacExtended, 2)
+			split := len(points) / 2
+			go processChunk(0, chSplit, c, points[:split], scalars[:split])
+			go processChunk(0, chSplit, c, points[split:], scalars[split:])
+			go func() {
+				s1 := <-chSplit
+				s2 := <-chSplit
+				close(chSplit)
+				s1.add(&s2)
+				chChunks[0] <- s1
+			}()
+		}
 
-	default:
-		panic("not implemented")
 	}
+
+	return msmReduceChunkG2Affine(p, int(c), chChunks[:])
 }
 
 // msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp
@@ -725,121 +457,139 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J
 	return p.unsafeFromJacExtended(&_p)
 }
 
-func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64,
-	chRes chan<- g2JacExtended,
-	c uint64,
-	points []G2Affine,
-	scalars []fr.Element) {
+// selector stores the index, mask and shifts needed to select bits from a scalar
+// it is used during the multiExp algorithm or the batch scalar multiplication
+type selector struct {
+	index uint64 // index in the multi-word scalar to select bits from
+	mask  uint64 // mask (c-bit wide)
+	shift uint64 // shift needed to get our bits on low positions
 
-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
+	multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
+	maskHigh        uint64 // same as mask, for index+1
+	shiftHigh       uint64 // same as shift, for index+1
+}
 
-	var buckets B
-	for i := 0; i < len(buckets); i++ {
-		buckets[i].setInfinity()
-	}
+// partitionScalars computes, for each scalar over c-bit wide windows, nbChunks digits
+// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
+// scalarsMont indicates whether the provided scalars are in Montgomery form
+// returns smallValues, the number of scalars that meet the following condition:
+// 0 < scalar < 2^c (in other words, scalars where only the c least significant bits are non-zero)
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+	toReturn := make([]fr.Element, len(scalars))
 
-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
 	}
 
-	// for each scalars, get the digit corresponding to the chunk we're processing.
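A hedged sketch of the recoding that the partitionScalars comment above describes, applied to a single uint64 instead of an fr.Element (and ignoring a carry out of the topmost window, which the real code absorbs in the extra chunk). Digits land in [-2^{c-1}, 2^{c-1}-1], which is why 2^{c-1} buckets suffice once negative digits are handled by adding -G:

// signedDigits recodes scalar into base-2^c signed digits.
func signedDigits(scalar uint64, c uint) []int {
	digits := make([]int, 0, 64/c+1)
	carry := 0
	for shift := uint(0); shift < 64; shift += c {
		d := carry + int((scalar>>shift)&(1<<c-1))
		carry = 0
		if d >= 1<<(c-1) {
			d -= 1 << c // borrow 2^c from the next window...
			carry = 1   // ...and make the current digit negative
		}
		digits = append(digits, d)
	}
	return digits // scalar == sum of digits[j] * 2^(c*j), given no final carry
}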
-	for i := 0; i < len(scalars); i++ {
-		bits := (scalars[i][s.index] & s.mask) >> s.shift
-		if s.multiWordSelect {
-			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
-		}
-
-		if bits == 0 {
-			continue
-		}
+	mask := uint64((1 << c) - 1) // low c bits are 1
-		// if msbWindow bit is set, we need to substract
-		if bits&msbWindow == 0 {
-			// add
-			buckets[bits-1].addMixed(&points[i])
-		} else {
-			// sub
-			buckets[bits & ^msbWindow].subMixed(&points[i])
+	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
+	max := int(1 << (c - 1))          // max value we want for our digits
+	cDivides64 := (64 % c) == 0       // if c doesn't divide 64, we may need to select over multiple words
-		}
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
 		}
+		selectors[chunk] = d
 	}
 
-	// reduce buckets into total
-	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split the first chunk
+	// processing in the msm in 2, to ensure all go routines finish at ~same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it does, though, this will deadlock.
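The deadlock warning just above is a channel-sizing argument: every worker sends exactly one count, and nothing is drained until all workers have finished, so the buffer must hold one slot per worker. A standalone sketch of the same aggregation, with sync.WaitGroup standing in for parallel.Execute's completion:

package sketch

import "sync"

func countParallel(n, nbWorkers int, pred func(int) bool) int {
	ch := make(chan int, nbWorkers) // one slot per worker, or wg.Wait() below never returns
	var wg sync.WaitGroup
	per := (n + nbWorkers - 1) / nbWorkers
	for w := 0; w < nbWorkers; w++ {
		start, end := w*per, (w+1)*per
		if end > n {
			end = n
		}
		wg.Add(1)
		go func(start, end int) {
			defer wg.Done()
			count := 0
			for i := start; i < end; i++ {
				if pred(i) {
					count++
				}
			}
			ch <- count // buffered: never blocks, so Done always runs
		}(start, end)
	}
	wg.Wait() // all sends are in the buffer at this point
	close(ch)
	total := 0
	for c := range ch {
		total += c
	}
	return total
}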
+ chSmallValues := make(chan int, nbTasks) - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } + parallel.Execute(len(scalars), func(start, end int) { + smallValues := 0 + for i := start; i < end; i++ { + var carry int - chRes <- total + scalar := scalars[i] + if scalarsMont { + scalar.FromMont() + } + if scalar.FitsOnOneWord() { + // everything is 0, no need to process this scalar + if scalar[0] == 0 { + continue + } + // low c-bits are 1 in mask + if scalar[0]&mask == scalar[0] { + smallValues++ + } + } -} + // for each chunk in the scalar, compute the current digit, and an eventual carry + for chunk := uint64(0); chunk < nbChunks; chunk++ { + s := selectors[chunk] -func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // init with carry if any + digit := carry + carry = 0 - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G2Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g2JacExtended, 1<<(lastC-1)) - // TODO @gbotrel last C restore. - msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars) - } + // if digit is zero, no impact on result + if digit == 0 { + continue + } - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract + // 2^{c} to the current digit, making it negative. 
+ if digit >= max { + digit -= (1 << c) + carry = 1 + } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + var bits uint64 + if digit >= 0 { + bits = uint64(digit) + } else { + bits = uint64(-digit-1) | msbWindow + } - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) + toReturn[i][s.index] |= (bits << s.shift) + if s.multiWordSelect { + toReturn[i][s.index+1] |= (bits >> s.shiftHigh) + } + + } + } + + chSmallValues <- smallValues + + }, nbTasks) + + // aggregate small values + close(chSmallValues) + smallValues := 0 + for o := range chSmallValues { + smallValues += o + } + return toReturn, smallValues } diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index 72d199f31f..f012110b80 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -17,11 +17,7 @@ package bw6761 import ( - "errors" - "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" - "math" - "runtime" ) const MAX_BATCH_SIZE = 600 @@ -34,210 +30,118 @@ func (o batchOp) isNeg() bool { return o.pointID&1 == 1 } -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// processChunkG1BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. // -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) { - var _p G1Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. 
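Two bit-packing conventions meet in this region: partitionScalars stores a negative digit d as (-d-1) | msbWindow, and batchOp (above) keeps the point index in the upper bits of pointID with the subtraction flag in bit 0, which is all isNeg() tests. A sketch of the digit round-trip (encodeDigit and decodeDigit are illustrative helpers, not library functions):

// encodeDigit packs a signed digit into c bits: non-negative digits verbatim,
// a negative digit d as (-d-1) with the window's msb set.
func encodeDigit(d int, c uint) uint64 {
	msbWindow := uint64(1) << (c - 1)
	if d >= 0 {
		return uint64(d)
	}
	return uint64(-d-1) | msbWindow
}

// decodeDigit recovers the bucket index and sign; it assumes bits != 0,
// since zero digits are skipped before any bucket update.
func decodeDigit(bits uint64, c uint) (bucketID uint64, neg bool) {
	msbWindow := uint64(1) << (c - 1)
	if bits&msbWindow == 0 {
		return bits - 1, false // digit d: add the point to bucket d-1
	}
	return bits &^ msbWindow, true // digit d < 0: subtract it from bucket |d|-1
}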
- // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, + chRes chan<- g1JacExtended, + c uint64, + points []G1Affine, + scalars []fr.Element) { - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() } - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) } - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } + batch := newBatchG1Affine(&buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh } - // empirical, needs to be tuned. 
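The bestC closure being removed here scores every implemented window size c with cost of roughly bits/c * (nbPoints + 2^c): a scalar yields bits/c chunks, and each chunk costs one bucket update per point plus on the order of 2^c operations to build and reduce its buckets. A standalone sketch of the heuristic (bits would be fr.Limbs*64; as the deleted comment notes, the model still needs empirical tuning):

package sketch

import "math"

// bestC picks the window size minimizing (bits/c)*(nbPoints + 2^c) over the
// window sizes a build actually implements.
func bestC(nbPoints, bits int, implementedCs []uint64) uint64 {
	best := implementedCs[0]
	minCost := math.MaxFloat64
	for _, c := range implementedCs {
		cost := float64(bits*(nbPoints+(1<<c))) / float64(c)
		if cost < minCost {
			minCost, best = cost, c
		}
	}
	return best
}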
- // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 + if bits == 0 { + continue } - } - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. - _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } } - - msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG1Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. 
} - close(chDone) - return p, nil -} - -func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC3](p, 3, points, scalars, splitFirstChunk) - - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - - case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC4](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC6](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC6](p, 7, points, scalars, splitFirstChunk) - - case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC6](p, 9, points, scalars, splitFirstChunk) - - case 10: - batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC4](p, 10, points, scalars, splitFirstChunk) - - case 11: - batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC10](p, 11, points, scalars, splitFirstChunk) - - case 12: - batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC12](p, 12, points, scalars, splitFirstChunk) - - case 13: - batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC7](p, 13, points, scalars, splitFirstChunk) - - case 14: - batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC6](p, 14, points, scalars, splitFirstChunk) - - case 15: - batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC9](p, 15, points, scalars, splitFirstChunk) - - case 16: - batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC10](p, 17, points, scalars, splitFirstChunk) - - case 18: - batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC6](p, 18, points, scalars, splitFirstChunk) + // flush items in batch. + batch.ExecuteAndReset() - case 19: - batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC4](p, 19, points, scalars, splitFirstChunk) + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] - case 20: - batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC4](p, 20, points, scalars, splitFirstChunk) + var runningSum, total g1JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } - case 21: - batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC6](p, 21, points, scalars, splitFirstChunk) + chRes <- total - case 22: - batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC10](p, 22, points, scalars, splitFirstChunk) +} - case 23: - batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC16](p, 23, points, scalars, splitFirstChunk) +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack +type bucketG1AffineC4 [1 << (4 - 1)]G1Affine +type bucketG1AffineC5 [1 << (5 - 1)]G1Affine +type bucketG1AffineC8 [1 << (8 - 1)]G1Affine +type bucketG1AffineC16 [1 << (16 - 1)]G1Affine - default: - panic("not implemented") - } +type ibG1Affine interface { + bucketG1AffineC4 | + bucketG1AffineC5 | + bucketG1AffineC8 | + bucketG1AffineC16 } type BatchG1Affine[B ibG1Affine] struct { @@ -347,10 +251,16 @@ func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B] } -func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, - chRes chan<- g1JacExtended, +// processChunkG2BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. +// +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, + chRes chan<- g2JacExtended, c uint64, - points []G1Affine, + points []G2Affine, scalars []fr.Element) { mask := uint64((1 << c) - 1) // low c bits are 1 @@ -372,7 +282,7 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, s.shiftHigh = (c - nbBitsHigh) } - batch := newBatchG1Affine(&buckets, points) + batch := newBatchG2Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 for i := 0; i < len(scalars); i++ { @@ -417,7 +327,7 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) // batch.ExecuteAndReset() for len(queue) != 0 { - queue = processQueueG1Affine(queue, &batch) + queue = processQueueG2Affine(queue, &batch) batch.ExecuteAndReset() // execute batch even if not full. } @@ -427,7 +337,7 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] - var runningSum, total g1JacExtended + var runningSum, total g2JacExtended runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { @@ -441,366 +351,18 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, } -func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g1JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. - msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, int(c), chChunks[:]) -} - -type bucketG1AffineC1 [1 << (1 - 1)]G1Affine -type bucketG1AffineC2 [1 << (2 - 1)]G1Affine -type bucketG1AffineC3 [1 << (3 - 1)]G1Affine -type bucketG1AffineC4 [1 << (4 - 1)]G1Affine -type bucketG1AffineC5 [1 << (5 - 1)]G1Affine -type bucketG1AffineC6 [1 << (6 - 1)]G1Affine -type bucketG1AffineC7 [1 << (7 - 1)]G1Affine -type bucketG1AffineC8 [1 << (8 - 1)]G1Affine -type bucketG1AffineC9 [1 << (9 - 1)]G1Affine -type bucketG1AffineC10 [1 << (10 - 1)]G1Affine -type bucketG1AffineC11 [1 << (11 - 1)]G1Affine -type bucketG1AffineC12 [1 << (12 - 1)]G1Affine -type bucketG1AffineC13 [1 << (13 - 1)]G1Affine -type bucketG1AffineC14 [1 << (14 - 1)]G1Affine -type bucketG1AffineC15 [1 << (15 - 1)]G1Affine -type bucketG1AffineC16 [1 << (16 - 1)]G1Affine -type bucketG1AffineC17 [1 << (17 - 1)]G1Affine -type bucketG1AffineC18 [1 << (18 - 1)]G1Affine -type bucketG1AffineC19 [1 << (19 - 1)]G1Affine -type bucketG1AffineC20 [1 << (20 - 1)]G1Affine -type bucketG1AffineC21 [1 << (21 - 1)]G1Affine -type bucketG1AffineC22 [1 << (22 - 1)]G1Affine -type bucketG1AffineC23 [1 << (23 - 1)]G1Affine -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended -type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended -type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended -type bucketg1JacExtendedC6 
[1 << (6 - 1)]g1JacExtended -type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended -type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended -type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended -type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended -type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended -type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended -type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended -type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended -type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended -type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended -type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended -type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended -type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended -type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended - -type ibG1Affine interface { - bucketG1AffineC1 | - bucketG1AffineC2 | - bucketG1AffineC3 | - bucketG1AffineC4 | - bucketG1AffineC5 | - bucketG1AffineC6 | - bucketG1AffineC7 | - bucketG1AffineC8 | - bucketG1AffineC9 | - bucketG1AffineC10 | - bucketG1AffineC11 | - bucketG1AffineC12 | - bucketG1AffineC13 | - bucketG1AffineC14 | - bucketG1AffineC15 | - bucketG1AffineC16 | - bucketG1AffineC17 | - bucketG1AffineC18 | - bucketG1AffineC19 | - bucketG1AffineC20 | - bucketG1AffineC21 | - bucketG1AffineC22 | - bucketG1AffineC23 -} - -type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC2 | - bucketg1JacExtendedC3 | - bucketg1JacExtendedC4 | - bucketg1JacExtendedC5 | - bucketg1JacExtendedC6 | - bucketg1JacExtendedC7 | - bucketg1JacExtendedC8 | - bucketg1JacExtendedC9 | - bucketg1JacExtendedC10 | - bucketg1JacExtendedC11 | - bucketg1JacExtendedC12 | - bucketg1JacExtendedC13 | - bucketg1JacExtendedC14 | - bucketg1JacExtendedC15 | - bucketg1JacExtendedC16 | - bucketg1JacExtendedC17 | - bucketg1JacExtendedC18 | - bucketg1JacExtendedC19 | - bucketg1JacExtendedC20 | - bucketg1JacExtendedC21 | - bucketg1JacExtendedC22 | - bucketg1JacExtendedC23 -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... 
) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
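The signed-digit recoding described in step 1 above is easiest to see on a toy scalar. The sketch below is a hypothetical single-word version of what partitionScalars does limb by limb over fr.Element: any c-bit window strictly larger than 2^{c-1} borrows 2^c from the next window, so every digit lands in [-2^{c-1}, 2^{c-1}] and 2^{c-1} buckets suffice (computing -G for a negative digit is cheap).

package main

import "fmt"

// signedDigits recodes s into c-bit signed digits d[i] with
// sum(d[i] * 2^(c*i)) == s and |d[i]| <= 2^(c-1).
// Toy single-word version; the real code performs the same carry
// propagation across the scalar's 64-bit limbs.
func signedDigits(s uint64, c uint) []int64 {
	mask := uint64(1)<<c - 1
	var digits []int64
	carry := uint64(0)
	for s != 0 || carry != 0 {
		v := (s & mask) + carry
		s >>= c
		if v > 1<<(c-1) {
			// borrow 2^c from the next window and make this digit negative
			digits = append(digits, int64(v)-(int64(1)<<c))
			carry = 1
		} else {
			digits = append(digits, int64(v))
			carry = 0
		}
	}
	return digits
}

func main() {
	// 13 = 0b1101 with c = 3: the low window is 5 > 4, so it becomes
	// 5-8 = -3 with a carry into the next window: digits (-3, 2),
	// and indeed -3 + 2*8 = 13.
	fmt.Println(signedDigits(13, 3)) // [-3 2]
}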
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 1: - msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC3](p, 3, points, scalars, splitFirstChunk) - - case 4: - msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - - case 5: - msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC4](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC6](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC6](p, 7, points, scalars, splitFirstChunk) - - case 8: - msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC6](p, 9, points, scalars, splitFirstChunk) - - case 10: - batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC4](p, 10, points, scalars, splitFirstChunk) - - case 11: - batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC10](p, 11, points, scalars, splitFirstChunk) - - case 12: - batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC12](p, 12, points, scalars, splitFirstChunk) - - case 13: - batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC7](p, 13, points, scalars, splitFirstChunk) - - case 14: - batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC6](p, 14, points, scalars, splitFirstChunk) - - case 15: - batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC9](p, 15, points, scalars, splitFirstChunk) - - case 16: - batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC10](p, 17, points, scalars, splitFirstChunk) - - case 18: - batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC6](p, 18, points, scalars, splitFirstChunk) - - case 19: - batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC4](p, 19, points, scalars, splitFirstChunk) - - case 20: - batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC4](p, 20, points, scalars, splitFirstChunk) - - case 21: - batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC6](p, 21, points, scalars, splitFirstChunk) - - case 22: - batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC10](p, 22, points, scalars, splitFirstChunk) - - case 23: - batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC16](p, 23, points, scalars, splitFirstChunk) +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack +type bucketG2AffineC4 [1 << (4 - 1)]G2Affine +type bucketG2AffineC5 [1 << (5 - 1)]G2Affine +type bucketG2AffineC8 
[1 << (8 - 1)]G2Affine +type bucketG2AffineC16 [1 << (16 - 1)]G2Affine - default: - panic("not implemented") - } +type ibG2Affine interface { + bucketG2AffineC4 | + bucketG2AffineC5 | + bucketG2AffineC8 | + bucketG2AffineC16 } type BatchG2Affine[B ibG2Affine] struct { @@ -909,253 +471,3 @@ func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B] return queue } - -func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, - chRes chan<- g2JacExtended, - c uint64, - points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - batch := newBatchG2Affine(&buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - op.bucketID = uint32(bits - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) - } else { - // sub - op.bucketID = (uint32(bits & ^msbWindow)) - op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) - } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() - nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - } - } else { - // put it in queue. - queue = append(queue, op) - } - } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() - for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. - } - - // flush items in batch. - batch.ExecuteAndReset() - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] - - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { - runningSum.addMixed(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G2Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g2JacExtended, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. - msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) -} - -type bucketG2AffineC1 [1 << (1 - 1)]G2Affine -type bucketG2AffineC2 [1 << (2 - 1)]G2Affine -type bucketG2AffineC3 [1 << (3 - 1)]G2Affine -type bucketG2AffineC4 [1 << (4 - 1)]G2Affine -type bucketG2AffineC5 [1 << (5 - 1)]G2Affine -type bucketG2AffineC6 [1 << (6 - 1)]G2Affine -type bucketG2AffineC7 [1 << (7 - 1)]G2Affine -type bucketG2AffineC8 [1 << (8 - 1)]G2Affine -type bucketG2AffineC9 [1 << (9 - 1)]G2Affine -type bucketG2AffineC10 [1 << (10 - 1)]G2Affine -type bucketG2AffineC11 [1 << (11 - 1)]G2Affine -type bucketG2AffineC12 [1 << (12 - 1)]G2Affine -type bucketG2AffineC13 [1 << (13 - 1)]G2Affine -type bucketG2AffineC14 [1 << (14 - 1)]G2Affine -type bucketG2AffineC15 [1 << (15 - 1)]G2Affine -type bucketG2AffineC16 [1 << (16 - 1)]G2Affine -type bucketG2AffineC17 [1 << (17 - 1)]G2Affine -type bucketG2AffineC18 [1 << (18 - 1)]G2Affine -type bucketG2AffineC19 [1 << (19 - 1)]G2Affine -type bucketG2AffineC20 [1 << (20 - 1)]G2Affine -type bucketG2AffineC21 [1 << (21 - 1)]G2Affine -type bucketG2AffineC22 [1 << (22 - 1)]G2Affine -type bucketG2AffineC23 [1 << (23 - 1)]G2Affine -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended -type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended -type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended -type 
bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended -type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended -type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended -type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended -type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended -type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended -type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended -type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended -type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended -type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended -type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended -type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended -type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended -type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended -type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended - -type ibG2Affine interface { - bucketG2AffineC1 | - bucketG2AffineC2 | - bucketG2AffineC3 | - bucketG2AffineC4 | - bucketG2AffineC5 | - bucketG2AffineC6 | - bucketG2AffineC7 | - bucketG2AffineC8 | - bucketG2AffineC9 | - bucketG2AffineC10 | - bucketG2AffineC11 | - bucketG2AffineC12 | - bucketG2AffineC13 | - bucketG2AffineC14 | - bucketG2AffineC15 | - bucketG2AffineC16 | - bucketG2AffineC17 | - bucketG2AffineC18 | - bucketG2AffineC19 | - bucketG2AffineC20 | - bucketG2AffineC21 | - bucketG2AffineC22 | - bucketG2AffineC23 -} - -type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC2 | - bucketg2JacExtendedC3 | - bucketg2JacExtendedC4 | - bucketg2JacExtendedC5 | - bucketg2JacExtendedC6 | - bucketg2JacExtendedC7 | - bucketg2JacExtendedC8 | - bucketg2JacExtendedC9 | - bucketg2JacExtendedC10 | - bucketg2JacExtendedC11 | - bucketg2JacExtendedC12 | - bucketg2JacExtendedC13 | - bucketg2JacExtendedC14 | - bucketg2JacExtendedC15 | - bucketg2JacExtendedC16 | - bucketg2JacExtendedC17 | - bucketg2JacExtendedC18 | - bucketg2JacExtendedC19 | - bucketg2JacExtendedC20 | - bucketg2JacExtendedC21 | - bucketg2JacExtendedC22 | - bucketg2JacExtendedC23 -} diff --git a/ecc/bw6-761/multiexp_jacobian.go b/ecc/bw6-761/multiexp_jacobian.go new file mode 100644 index 0000000000..376ee3df28 --- /dev/null +++ b/ecc/bw6-761/multiexp_jacobian.go @@ -0,0 +1,177 @@ +// Copyright 2020 ConsenSys Software Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package bw6761 + +import ( + "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" +) + +func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, + chRes chan<- g1JacExtended, + c uint64, + points []G1Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + // for each scalars, get the digit corresponding to the chunk we're processing. + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + buckets[bits-1].addMixed(&points[i]) + } else { + // sub + buckets[bits & ^msbWindow].subMixed(&points[i]) + } + } + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var runningSum, total g1JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].ZZ.IsZero() { + runningSum.add(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack +type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended +type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended +type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended +type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended + +type ibg1JacExtended interface { + bucketg1JacExtendedC4 | + bucketg1JacExtendedC5 | + bucketg1JacExtendedC8 | + bucketg1JacExtendedC16 +} + +func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + // for each scalars, get the digit corresponding to the chunk we're processing. + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + buckets[bits-1].addMixed(&points[i]) + } else { + // sub + buckets[bits & ^msbWindow].subMixed(&points[i]) + } + } + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].ZZ.IsZero() { + runningSum.add(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack +type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended +type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended +type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended + +type ibg2JacExtended interface { + bucketg2JacExtendedC4 | + bucketg2JacExtendedC5 | + bucketg2JacExtendedC8 | + bucketg2JacExtendedC16 +} diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index 8d851d2d42..fa82870fa4 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) + innerMsmG1(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} + cRange := []uint64{4, 5, 8, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -130,10 +130,10 @@ func TestMultiExpG1(t *testing.T) { results := make([]G1Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&results[i], int(c), samplePoints[:], scalars, false) + innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG1Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -171,10 +171,10 @@ func TestMultiExpG1(t *testing.T) { results := make([]G1Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG1Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -209,8 +209,8 @@ func TestMultiExpG1(t *testing.T) { var result1, result2 G1Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) if !result1.Equal(&result2) { return false } @@ -288,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { 
b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -461,10 +461,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePoints[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -502,10 +502,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -540,8 +540,8 @@ func TestMultiExpG2(t *testing.T) { var result1, result2 G2Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result2, int(c), samplePoints[:], scalars, false) if !result1.Equal(&result2) { return false } @@ -619,7 +619,7 @@ func BenchmarkMultiExpG2(b *testing.B) { b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } diff --git a/internal/generator/config/curve.go b/internal/generator/config/curve.go index 0d387a7cf2..e1df940957 100644 --- a/internal/generator/config/curve.go +++ b/internal/generator/config/curve.go @@ -51,16 +51,16 @@ func (c Curve) Equal(other Curve) bool { } type Point struct { - CoordType string - CoordExtDegree uint8 // value n, such that q = pⁿ - CoordExtRoot int64 // value a, such that the field is Fp[X]/(Xⁿ - a) - PointName string - GLV bool // scalar multiplication using GLV - CofactorCleaning bool // flag telling if the Cofactor cleaning is available - CRange []int // multiexp bucket method: generate inner methods (with const arrays) for each c - Projective bool // generate projective coordinates - A []string //A 
linear coefficient in Weierstrass form - B []string //B constant term in Weierstrass form + CoordType string + CoordExtDegree uint8 // value n, such that q = pⁿ + CoordExtRoot int64 // value a, such that the field is Fp[X]/(Xⁿ - a) + PointName string + GLV bool // scalar multiplication using GLV + CofactorCleaning bool // flag telling if the Cofactor cleaning is available + CRange, LastCRange []int // multiexp bucket method: generate inner methods (with const arrays) for each c + Projective bool // generate projective coordinates + A []string //A linear coefficient in Weierstrass form + B []string //B constant term in Weierstrass form } var Curves []Curve diff --git a/internal/generator/ecc/generate.go b/internal/generator/ecc/generate.go index 6af6b7d54a..a4b3e9b5fd 100644 --- a/internal/generator/ecc/generate.go +++ b/internal/generator/ecc/generate.go @@ -17,6 +17,7 @@ func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) er entries := []bavard.Entry{ {File: filepath.Join(baseDir, "multiexp.go"), Templates: []string{"multiexp.go.tmpl"}}, {File: filepath.Join(baseDir, "multiexp_affine.go"), Templates: []string{"multiexp_affine.go.tmpl"}}, + {File: filepath.Join(baseDir, "multiexp_jacobian.go"), Templates: []string{"multiexp_jacobian.go.tmpl"}}, {File: filepath.Join(baseDir, "multiexp_test.go"), Templates: []string{"tests/multiexp.go.tmpl"}}, {File: filepath.Join(baseDir, "marshal.go"), Templates: []string{"marshal.go.tmpl"}}, {File: filepath.Join(baseDir, "marshal_test.go"), Templates: []string{"tests/marshal.go.tmpl"}}, @@ -26,7 +27,7 @@ func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) er funcs["last"] = func(x int, a interface{}) bool { return x == reflect.ValueOf(a).Len()-1 } - funcs["lastC"] = func(c int) int { + lastC := func(c int) int { // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) // if c divides fr.Limbs * 64; n := (conf.Fr.NbWords * 64) @@ -35,6 +36,8 @@ func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) er } return n - (c * (n / c)) } + funcs["lastC"] = lastC + funcs["contains"] = func(v int, s []int) bool { for _, sv := range s { if v == sv { @@ -43,12 +46,17 @@ func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) er } return false } - // TODO @gbotrel fix me. need to generate usual C, and missing lastC for bucket size. 
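The lastC helper added above computes the width of the top window: when c does not divide the scalar bit size conf.Fr.NbWords*64, the final chunk only covers the leftover bits, so its bucket array can be much smaller. A standalone sketch (the explicit nbWords parameter is for illustration; the generator reads it from conf.Fr):

// lastC returns the bit width of the highest window when splitting an
// (nbWords*64)-bit scalar into c-bit windows; it equals c when c divides
// the bit size, and the remainder otherwise.
func lastC(nbWords, c int) int {
	n := nbWords * 64
	if n%c == 0 {
		return c
	}
	return n - c*(n/c) // i.e. n % c
}

// e.g. a 6-limb fr (384 bits) with c = 13 gives 384 = 29*13 + 7, so the
// last chunk is only 7 bits wide and its buckets fit in the much smaller
// bucketg1JacExtendedC7 array, which is why case 13 above pairs the C13
// buckets with the C7 ones.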
- conf.G1.CRange = make([]int, 23) - conf.G2.CRange = make([]int, 23) for i := 0; i < len(conf.G1.CRange); i++ { - conf.G1.CRange[i] = i + 1 - conf.G2.CRange[i] = i + 1 + lc := lastC(conf.G1.CRange[i]) + if !contains(conf.G1.CRange, lc) && !contains(conf.G1.LastCRange, lc) { + conf.G1.LastCRange = append(conf.G1.LastCRange, lc) + } + } + for i := 0; i < len(conf.G2.CRange); i++ { + lc := lastC(conf.G2.CRange[i]) + if !contains(conf.G2.CRange, lc) && !contains(conf.G2.LastCRange, lc) { + conf.G2.LastCRange = append(conf.G2.LastCRange, lc) + } } bavardOpts := []func(*bavard.Bavard) error{bavard.Funcs(funcs)} if err := bgen.GenerateWithOptions(conf, packageName, "./ecc/template", bavardOpts, entries...); err != nil { @@ -105,3 +113,12 @@ type pconf struct { config.Curve config.Point } + +func contains(slice []int, v int) bool { + for i := 0; i < len(slice); i++ { + if slice[i] == v { + return true + } + } + return false +} diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index 62b5f03f62..9909d8055b 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -16,6 +16,10 @@ import ( "runtime" ) +{{ template "multiexp" dict "PointName" .G1.PointName "UPointName" (toUpper .G1.PointName) "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange}} +{{ template "multiexp" dict "PointName" .G2.PointName "UPointName" (toUpper .G2.PointName) "TAffine" $G2TAffine "TJacobian" $G2TJacobian "TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G2.CRange}} + + // selector stores the index, mask and shifts needed to select bits from a scalar // it is used during the multiExp algorithm or the batch scalar multiplication type selector struct { @@ -157,8 +161,6 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } -{{ template "multiexp" dict "PointName" .G1.PointName "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange}} -{{ template "multiexp" dict "PointName" .G2.PointName "TAffine" $G2TAffine "TJacobian" $G2TJacobian "TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G2.CRange}} {{define "multiexp" }} @@ -270,136 +272,68 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInner{{ $.TJacobian }} , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - // we have nbSplits intermediate results that we must sum together. + // we have nbSplits intermediate results that we must sum together. 
_p := make([]{{ $.TJacobian }}, nbSplits - 1) chDone := make(chan int, nbSplits - 1) for i:=0; i < nbSplits-1; i++ { start := i * nbPoints - end := start + nbPoints + end := start + nbPoints go func(start, end, i int) { - msmInner{{ $.TJacobian }}(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsm{{ $.UPointName }}(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - - msmInner{{ $.TJacobian }}(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + + innerMsm{{ $.UPointName }}(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) for i:=0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) } close(chDone) - return p, nil + return p, nil } -func msmInner{{ $.TJacobian }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffine }}, scalars []fr.Element, splitFirstChunk bool) { +func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffine }}, scalars []fr.Element, splitFirstChunk bool) { switch c { {{range $c := $.CRange}} + {{- $lc := lastC $c}} case {{$c}}: - msmC{{ $.TAffine }}[bucket{{ $.TJacobianExtended }}C{{$c}}, bucket{{ $.TJacobianExtended }}C{{lastC $c}}](p, {{$c}}, points, scalars, splitFirstChunk) - {{end}} + {{- if le $c 9}} + processChunk := processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}] + {{- else}} + processChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{$c}}] + {{- end}} + {{- if eq $c $lc}} + _innerMsm{{ $.UPointName }}(p, {{$c}}, points, scalars, splitFirstChunk, processChunk, processChunk) + {{- else}} + {{- if le $lc 9}} + processLastChunk := processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{lastC $c}}] + {{- else}} + processLastChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{lastC $c}}] + {{- end}} + _innerMsm{{ $.UPointName }}(p, {{$c}}, points, scalars, splitFirstChunk, processChunk, processLastChunk) + {{- end}} + {{- end}} default: panic("not implemented") } } -// msmReduceChunk{{ $.TAffine }} reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunk{{ $.TAffine }}(p *{{ $.TJacobian }}, c int, chChunks []chan {{ $.TJacobianExtended }}) *{{ $.TJacobian }} { - var _p {{ $.TJacobianExtended }} - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - - -func msmProcessChunk{{ $.TAffine }}[B ib{{ $.TJacobianExtended }}](chunk uint64, - chRes chan<- {{ $.TJacobianExtended }}, - c uint64, - points []{{ $.TAffine }}, - scalars []fr.Element) { - - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c -1)) - - var buckets B - for i := 0 ; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64 %c)!=0 && s.shift > (64-c) && s.index < (fr.Limbs - 1 ) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - - // for each scalars, get the digit corresponding to the chunk we're processing. 
- for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - // if msbWindow bit is set, we need to substract - if bits & msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) - } - } - - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - - var runningSum, total {{ $.TJacobianExtended }} - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - {{/* close(chRes) */}} -} - - - -func msmC{{ $.TAffine }}[B ib{{ $.TJacobianExtended }}, LB ib{{ $.TJacobianExtended }}](p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, scalars []fr.Element, splitFirstChunk bool) *{{ $.TJacobian }} { +func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, scalars []fr.Element)) *{{ $.TJacobian }} { + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { nbChunks++ } - // for each chunk, spawn one go routine that'll loop through all the scalars in the + + // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // note that buckets is an array allocated on the stack (for most sizes of c) and this is // critical for performance // each go routine sends its result in chChunks[i] channel @@ -408,48 +342,58 @@ func msmC{{ $.TAffine }}[B ib{{ $.TJacobianExtended }}, LB ib{{ $.TJacobianExten chChunks[i] = make(chan {{ $.TJacobianExtended }}, 1) } + // the last chunk may be processed with a different method than the rest, as it could be smaller. + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) + for j := int(nbChunks - 2); j >0; j-- { + go processChunk(uint64(j), chChunks[j], c, points, scalars) + } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []{{ $.TAffine }}, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]{{ $.TJacobianExtended }}, 1<<(lastC-1)) - // TODO @gbotrel last C restore. - msmProcessChunk{{ $.TAffine }}[LB](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks - 1), points, scalars) - nbChunks-- + // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] + // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed + // in the ~same amount of time + if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. 
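+		// with mostly-small scalars (e.g. 0/1 SNARK witness values) every window
+		// above the lowest is zero, so the goroutines for the other chunks return
+		// almost immediately and chunk 0 dominates the wall clock; halving its
+		// input restores the balance at the cost of one extra add below.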
+ if !splitFirstChunk { + go processChunk(0,chChunks[0], c, points, scalars) + } else { + chSplit := make(chan {{ $.TJacobianExtended }}, 2) + split := len(points) / 2 + go processChunk(0,chSplit, c, points[:split], scalars[:split]) + go processChunk(0,chSplit, c, points[split:], scalars[split:]) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + } + + return msmReduceChunk{{ $.TAffine }}(p, int(c), chChunks[:]) +} - processChunk := func(j int, points []{{ $.TAffine }}, scalars []fr.Element, chChunk chan {{ $.TJacobianExtended }}) { - msmProcessChunk{{ $.TAffine }}[B](uint64(j), chChunk, c, points, scalars) - } - for j := int(nbChunks - 1); j >0; j-- { - go processChunk(j, points, scalars, chChunks[j]) +// msmReduceChunk{{ $.TAffine }} reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunk{{ $.TAffine }}(p *{{ $.TJacobian }}, c int, chChunks []chan {{ $.TJacobianExtended }}) *{{ $.TJacobian }} { + var _p {{ $.TJacobianExtended }} + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan {{ $.TJacobianExtended }}, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + return p.unsafeFromJacExtended(&_p) +} + + - return msmReduceChunk{{ $.TAffine }}(p, int(c), chChunks[:]) -} {{end }} diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index 3a803280f3..0c0fba41e5 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -8,10 +8,6 @@ import ( "github.com/consensys/gnark-crypto/ecc/{{.Name}}/fr" - "github.com/consensys/gnark-crypto/ecc" - "errors" - "math" - "runtime" ) const MAX_BATCH_SIZE = 600 @@ -26,158 +22,124 @@ func (o batchOp) isNeg() bool { -{{ template "multiexp" dict "PointName" .G1.PointName "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange}} -{{ template "multiexp" dict "PointName" .G2.PointName "TAffine" $G2TAffine "TJacobian" $G2TJacobian "TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange}} +{{ template "multiexp" dict "PointName" .G1.PointName "UPointName" (toUpper .G1.PointName) "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange "LastCRange" .G1.LastCRange}} +{{ template "multiexp" dict "PointName" .G2.PointName "UPointName" (toUpper .G2.PointName) "TAffine" $G2TAffine "TJacobian" $G2TJacobian "TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G2.CRange "LastCRange" .G2.LastCRange}} {{define "multiexp" }} -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. 
-func (p *{{ $.TAffine }}) MultiExpBatchAffine(points []{{ $.TAffine }}, scalars []fr.Element, config ecc.MultiExpConfig) (*{{ $.TAffine }}, error) { - var _p {{$.TJacobian}} - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} +// processChunk{{ $.UPointName }}BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. +// +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64, + chRes chan<- {{ $.TJacobianExtended }}, + c uint64, + points []{{ $.TAffine }}, + scalars []fr.Element) { -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *{{ $.TJacobian }}) MultiExpBatchAffine(points []{{ $.TAffine }}, scalars []fr.Element, config ecc.MultiExpConfig) (*{{ $.TJacobian }}, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. 
- // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() } - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64 %c)!=0 && s.shift > (64-c) && s.index < (fr.Limbs - 1 ) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) } - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{ - {{- range $c := $.CRange}} {{- if and (eq $.PointName "g1") (gt $c 21)}}{{- else}} {{$c}},{{- end}}{{- end}} + batch := newBatch{{ $.TAffine }}(&buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh } - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } + + if bits == 0 { + continue } - // empirical, needs to be tuned. 
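The removed comments above encode the window-size heuristic: approximate cost = (scalar bits / c) * (nbPoints + 2^c) group operations, minimized over the implemented window sizes. A self-contained sketch of that selection; the candidate list and bit count are parameters here rather than the generated constants:

package main

import (
	"fmt"
	"math"
)

// bestC returns the candidate window size minimizing the approximate cost
// (frBits/c)*(nbPoints + 2^c): frBits/c chunks, each doing ~nbPoints bucket
// additions plus ~2^c operations to reduce its buckets.
func bestC(nbPoints, frBits int, candidates []uint64) uint64 {
	best, min := candidates[0], math.MaxFloat64
	for _, c := range candidates {
		cost := float64(frBits) * float64(nbPoints+1<<c) / float64(c)
		if cost < min {
			min, best = cost, c
		}
	}
	return best
}

func main() {
	cs := []uint64{4, 5, 8, 16}
	// larger windows win once nbPoints >> 2^c:
	fmt.Println(bestC(1<<20, 256, cs)) // 16
}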
- // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs * 64) % C != 0 { - nbChunks ++ + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) } } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInner{{ $.TJacobian }}BatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. - _p := make([]{{ $.TJacobian }}, nbSplits - 1) - chDone := make(chan int, nbSplits - 1) - for i:=0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInner{{ $.TJacobian }}BatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueue{{ $.TAffine }}(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. } - msmInner{{ $.TJacobian }}BatchAffine(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - for i:=0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] + + var runningSum, total {{ $.TJacobianExtended }} + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) } - close(chDone) - return p, nil + + chRes <- total + } -func msmInner{{ $.TJacobian }}BatchAffine(p *{{ $.TJacobian }}, c int, points []{{ $.TAffine }}, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - {{range $c := $.CRange}} - case {{$c}}: - {{- if le $c 9}} - msmC{{ $.TAffine }}[bucket{{ $.TJacobianExtended }}C{{$c}}, bucket{{ $.TJacobianExtended }}C{{lastC $c}}](p, {{$c}}, points, scalars, splitFirstChunk) - {{- else}} - batch{{ $.TAffine }}Msm[bucket{{ $.TAffine }}C{{$c}}, bucket{{ $.TJacobianExtended }}C{{lastC $c}}](p, {{$c}}, points, scalars, splitFirstChunk) - {{- end}} - {{end}} - default: - panic("not implemented") - } +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack +{{- range $c := $.CRange}} +type bucket{{ $.TAffine }}C{{$c}} [1<<({{$c}}-1)]{{ $.TAffine }} +{{- end}} + +type ib{{ $.TAffine }} interface { + {{- range $i, $c := $.CRange}} + bucket{{ $.TAffine }}C{{$c}} {{- if not (last $i $.CRange)}} | {{- end}} + {{- end}} } @@ -288,180 +250,4 @@ func processQueue{{ $.TAffine }}[B ib{{ $.TAffine }}](queue []batchOp, batch *Ba } -func msmProcessChunk{{ $.TAffine }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64, - chRes chan<- {{ $.TJacobianExtended }}, - c uint64, - points []{{ $.TAffine }}, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64 %c)!=0 && s.shift > (64-c) && s.index < (fr.Limbs - 1 ) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - batch := newBatch{{ $.TAffine }}(&buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - op.bucketID = uint32(bits - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) - } else { - // sub - op.bucketID = (uint32(bits & ^msbWindow)) - op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) - } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() - nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - } - } else { - // put it in queue. 
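// why the queue exists: a batch performs a single shared inversion and then writes
// each destination bucket exactly once (see BatchAddG1Affine), so a bucket may
// appear at most once per batch; batch.CanAdd reports whether the bucket is still
// free in the current batch, and a conflicting op is parked in the queue and
// retried after the batch has been executed and reset.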
- queue = append(queue, op) - } - } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() - for len(queue) != 0 { - queue = processQueue{{ $.TAffine }}(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. - } - - // flush items in batch. - batch.ExecuteAndReset() - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - - var runningSum, total {{ $.TJacobianExtended }} - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { - runningSum.addMixed(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - - - -func batch{{ $.TAffine }}Msm[B ib{{ $.TAffine }}, J ib{{ $.TJacobianExtended }}](p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, scalars []fr.Element, splitFirstChunk bool) *{{ $.TJacobian }} { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan {{ $.TJacobianExtended }}, nbChunks) - for i:=0; i < len(chChunks);i++ { - chChunks[i] = make(chan {{ $.TJacobianExtended }}, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []{{ $.TAffine }}, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]{{ $.TJacobianExtended }}, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. 
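// note on the last chunk: when c does not divide fr.Limbs*64, the final window is
// narrower, lastC = fr.Limbs*64 - c*(nbChunks-1) bits, so it needs only
// 2^{lastC-1} buckets; with that few buckets the batch-affine machinery does not
// pay for its bookkeeping, hence the fallback to the ext-jacobian processing
// below (which window sizes justify the affine path is still an open tuning
// question, per the TODOs in this patch).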
- msmProcessChunk{{ $.TAffine }}[J](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks - 1), points, scalars) - nbChunks-- - } - - - processChunk := func(j int, points []{{ $.TAffine }}, scalars []fr.Element, chChunk chan {{ $.TJacobianExtended }}) { - msmProcessChunk{{ $.TAffine }}BatchAffine[B](uint64(j), chChunk, c, points, scalars) - } - - for j := int(nbChunks - 1); j >0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan {{ $.TJacobianExtended }}, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunk{{ $.TAffine }}(p, int(c), chChunks[:]) -} - - - - -{{- range $c := $.CRange}} -type bucket{{ $.TAffine }}C{{$c}} [1<<({{$c}}-1)]{{ $.TAffine }} -{{- end}} -{{- range $c := $.CRange}} -type bucket{{ $.TJacobianExtended }}C{{$c}} [1<<({{$c}}-1)]{{ $.TJacobianExtended }} -{{- end}} - -type ib{{ $.TAffine }} interface { - {{- range $i, $c := $.CRange}} - bucket{{ $.TAffine }}C{{$c}} {{- if not (last $i $.CRange)}} | {{- end}} - {{- end}} -} - -type ib{{ $.TJacobianExtended }} interface { - {{- range $i, $c := $.CRange}} - bucket{{ $.TJacobianExtended }}C{{$c}} {{- if not (last $i $.CRange)}} | {{- end}} - {{- end}} -} - {{end }} diff --git a/internal/generator/ecc/template/multiexp_jacobian.go.tmpl b/internal/generator/ecc/template/multiexp_jacobian.go.tmpl new file mode 100644 index 0000000000..72217dba82 --- /dev/null +++ b/internal/generator/ecc/template/multiexp_jacobian.go.tmpl @@ -0,0 +1,106 @@ +{{ $G1TAffine := print (toUpper .G1.PointName) "Affine" }} +{{ $G1TJacobian := print (toUpper .G1.PointName) "Jac" }} +{{ $G1TJacobianExtended := print (toLower .G1.PointName) "JacExtended" }} + +{{ $G2TAffine := print (toUpper .G2.PointName) "Affine" }} +{{ $G2TJacobian := print (toUpper .G2.PointName) "Jac" }} +{{ $G2TJacobianExtended := print (toLower .G2.PointName) "JacExtended" }} + +import ( + "github.com/consensys/gnark-crypto/ecc/{{.Name}}/fr" +) + + +{{ template "multiexp" dict "PointName" .G1.PointName "UPointName" (toUpper .G1.PointName) "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange "LastCRange" .G1.LastCRange}} +{{ template "multiexp" dict "PointName" .G2.PointName "UPointName" (toUpper .G2.PointName) "TAffine" $G2TAffine "TJacobian" $G2TJacobian "TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G2.CRange "LastCRange" .G2.LastCRange}} + + + +{{define "multiexp" }} + +func processChunk{{ $.UPointName }}Jacobian[B ib{{ $.TJacobianExtended }}](chunk uint64, + chRes chan<- {{ $.TJacobianExtended }}, + c uint64, + points []{{ $.TAffine }}, + scalars []fr.Element) { + + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c -1)) + + var buckets B + for i := 0 ; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64 %c)!=0 && s.shift > (64-c) && s.index < (fr.Limbs - 1 ) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + + // for each scalars, get the digit 
corresponding to the chunk we're processing. + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + // if msbWindow bit is set, we need to substract + if bits & msbWindow == 0 { + // add + buckets[bits-1].addMixed(&points[i]) + } else { + // sub + buckets[bits & ^msbWindow].subMixed(&points[i]) + } + } + + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var runningSum, total {{ $.TJacobianExtended }} + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].ZZ.IsZero() { + runningSum.add(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + {{/* close(chRes) */}} +} + +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack +{{- range $c := $.CRange}} +type bucket{{ $.TJacobianExtended }}C{{$c}} [1<<({{$c}}-1)]{{ $.TJacobianExtended }} +{{- end}} +{{- range $c := $.LastCRange}} +type bucket{{ $.TJacobianExtended }}C{{$c}} [1<<({{$c}}-1)]{{ $.TJacobianExtended }} +{{- end}} + +type ib{{ $.TJacobianExtended }} interface { + {{- range $i, $c := $.LastCRange}} + bucket{{ $.TJacobianExtended }}C{{$c}} | + {{- end}} + {{- range $i, $c := $.CRange}} + bucket{{ $.TJacobianExtended }}C{{$c}} {{- if not (last $i $.CRange)}} | {{- end}} + {{- end}} +} + +{{end }} diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index 93e26a09d6..ef51368fbc 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -91,7 +91,7 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmC{{ $.TAffine }}[bucket{{ $.TJacobianExtended }}C16, bucket{{ $.TJacobianExtended }}C{{lastC 16}}](&r16, 16, samplePoints[:], scalars16, true) + innerMsm{{ toUpper $.PointName }}(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -138,10 +138,10 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { results := make([]{{ $.TJacobian }}, len(cRange) + 1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInner{{ $.TJacobian }}(&results[i], int(c), samplePoints[:], scalars, false) + innerMsm{{ toUpper $.PointName }}(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInner{{ $.TJacobian }}(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsm{{ toUpper $.PointName }}(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i:=1; i < len(results);i++ { @@ -179,10 +179,10 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { results := make([]{{ $.TJacobian }}, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInner{{ $.TJacobian }}(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsm{{ toUpper $.PointName }}(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInner{{ $.TJacobian }}(&results[len(results)-1], 16, 
samplePointsZero[:], scalars, true) + innerMsm{{ toUpper $.PointName }}(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -218,8 +218,8 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { var result1, result2 {{ $.TJacobian }} for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInner{{ $.TJacobian }}(&result1, int(c), samplePoints[:], scalars, false) - msmInner{{ $.TJacobian }}BatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsm{{ toUpper $.PointName }}(&result1, int(c), samplePoints[:], scalars, false) + innerMsm{{ toUpper $.PointName }}(&result2, int(c), samplePoints[:], scalars, false) if !result1.Equal(&result2) { return false } @@ -300,7 +300,7 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using],ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using],ecc.MultiExpConfig{}) } }) } From 16352ccdef68a4528ba531448e48da2891662c69 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 8 Nov 2022 14:42:08 -0600 Subject: [PATCH 05/43] docs: added a todo in tmpl --- ecc/bls12-377/multiexp_jacobian.go | 2 -- ecc/bls12-378/multiexp_jacobian.go | 2 -- ecc/bls12-381/multiexp_jacobian.go | 2 -- ecc/bls24-315/multiexp_jacobian.go | 2 -- ecc/bls24-317/multiexp_jacobian.go | 2 -- ecc/bn254/multiexp_jacobian.go | 2 -- ecc/bw6-633/multiexp_jacobian.go | 2 -- ecc/bw6-756/multiexp_jacobian.go | 2 -- ecc/bw6-761/multiexp_jacobian.go | 2 -- internal/generator/ecc/template/multiexp.go.tmpl | 2 ++ internal/generator/ecc/template/multiexp_jacobian.go.tmpl | 1 - 11 files changed, 2 insertions(+), 19 deletions(-) diff --git a/ecc/bls12-377/multiexp_jacobian.go b/ecc/bls12-377/multiexp_jacobian.go index fc89ebd2cc..dc787cb52e 100644 --- a/ecc/bls12-377/multiexp_jacobian.go +++ b/ecc/bls12-377/multiexp_jacobian.go @@ -81,7 +81,6 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types @@ -185,7 +184,6 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types diff --git a/ecc/bls12-378/multiexp_jacobian.go b/ecc/bls12-378/multiexp_jacobian.go index a26fe93845..33efbc4286 100644 --- a/ecc/bls12-378/multiexp_jacobian.go +++ b/ecc/bls12-378/multiexp_jacobian.go @@ -81,7 +81,6 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types @@ -185,7 +184,6 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types diff --git a/ecc/bls12-381/multiexp_jacobian.go b/ecc/bls12-381/multiexp_jacobian.go index a4e61348b7..b62f2d6012 100644 --- a/ecc/bls12-381/multiexp_jacobian.go +++ b/ecc/bls12-381/multiexp_jacobian.go @@ -81,7 +81,6 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types @@ -185,7 +184,6 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types diff --git a/ecc/bls24-315/multiexp_jacobian.go b/ecc/bls24-315/multiexp_jacobian.go index 4399395829..f4a2abe4b6 100644 --- 
a/ecc/bls24-315/multiexp_jacobian.go +++ b/ecc/bls24-315/multiexp_jacobian.go @@ -81,7 +81,6 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types @@ -185,7 +184,6 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types diff --git a/ecc/bls24-317/multiexp_jacobian.go b/ecc/bls24-317/multiexp_jacobian.go index d948e2c697..b928244181 100644 --- a/ecc/bls24-317/multiexp_jacobian.go +++ b/ecc/bls24-317/multiexp_jacobian.go @@ -81,7 +81,6 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types @@ -185,7 +184,6 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types diff --git a/ecc/bn254/multiexp_jacobian.go b/ecc/bn254/multiexp_jacobian.go index 4939af44c8..5434d0e1aa 100644 --- a/ecc/bn254/multiexp_jacobian.go +++ b/ecc/bn254/multiexp_jacobian.go @@ -81,7 +81,6 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types @@ -185,7 +184,6 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types diff --git a/ecc/bw6-633/multiexp_jacobian.go b/ecc/bw6-633/multiexp_jacobian.go index f331d07491..8ed7343862 100644 --- a/ecc/bw6-633/multiexp_jacobian.go +++ b/ecc/bw6-633/multiexp_jacobian.go @@ -81,7 +81,6 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types @@ -159,7 +158,6 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types diff --git a/ecc/bw6-756/multiexp_jacobian.go b/ecc/bw6-756/multiexp_jacobian.go index 9dc8862130..984264456b 100644 --- a/ecc/bw6-756/multiexp_jacobian.go +++ b/ecc/bw6-756/multiexp_jacobian.go @@ -81,7 +81,6 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types @@ -159,7 +158,6 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types diff --git a/ecc/bw6-761/multiexp_jacobian.go b/ecc/bw6-761/multiexp_jacobian.go index 376ee3df28..6e2acf4b41 100644 --- a/ecc/bw6-761/multiexp_jacobian.go +++ b/ecc/bw6-761/multiexp_jacobian.go @@ -81,7 +81,6 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types @@ -159,7 +158,6 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index 9909d8055b..6b5b935d81 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -298,6 +298,8 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffine }}, scalars []fr.Element, splitFirstChunk bool) { + {{- /* TODO @gbotrel need to deal with cases where lastC == 1 ; having a whole chunk with 1-bit window makes no sense */}} + {{- /* also need to determine until which 
window size the ext-jacobian version is worth it. */}} switch c { {{range $c := $.CRange}} {{- $lc := lastC $c}} diff --git a/internal/generator/ecc/template/multiexp_jacobian.go.tmpl b/internal/generator/ecc/template/multiexp_jacobian.go.tmpl index 72217dba82..d4e00fa442 100644 --- a/internal/generator/ecc/template/multiexp_jacobian.go.tmpl +++ b/internal/generator/ecc/template/multiexp_jacobian.go.tmpl @@ -82,7 +82,6 @@ func processChunk{{ $.UPointName }}Jacobian[B ib{{ $.TJacobianExtended }}](chunk } chRes <- total - {{/* close(chRes) */}} } // we declare the buckets as fixed-size array types From 95e4305ce5c393fd05489e0f022ed2947d42b980 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 8 Nov 2022 15:18:41 -0600 Subject: [PATCH 06/43] feat: partitionScalars return list of digits unpacked --- ecc/bls12-377/g1.go | 2 +- ecc/bls12-377/g2.go | 2 +- ecc/bls12-377/multiexp.go | 343 ++++++++++++------ ecc/bls12-377/multiexp_affine.go | 62 +--- ecc/bls12-377/multiexp_jacobian.go | 64 +--- ecc/bls12-377/multiexp_test.go | 4 +- ecc/bls12-378/g1.go | 2 +- ecc/bls12-378/g2.go | 2 +- ecc/bls12-378/multiexp.go | 343 ++++++++++++------ ecc/bls12-378/multiexp_affine.go | 62 +--- ecc/bls12-378/multiexp_jacobian.go | 64 +--- ecc/bls12-378/multiexp_test.go | 4 +- ecc/bls12-381/g1.go | 2 +- ecc/bls12-381/g2.go | 2 +- ecc/bls12-381/multiexp.go | 343 ++++++++++++------ ecc/bls12-381/multiexp_affine.go | 62 +--- ecc/bls12-381/multiexp_jacobian.go | 64 +--- ecc/bls12-381/multiexp_test.go | 4 +- ecc/bls24-315/g1.go | 2 +- ecc/bls24-315/g2.go | 2 +- ecc/bls24-315/multiexp.go | 343 ++++++++++++------ ecc/bls24-315/multiexp_affine.go | 62 +--- ecc/bls24-315/multiexp_jacobian.go | 64 +--- ecc/bls24-315/multiexp_test.go | 4 +- ecc/bls24-317/g1.go | 2 +- ecc/bls24-317/g2.go | 2 +- ecc/bls24-317/multiexp.go | 343 ++++++++++++------ ecc/bls24-317/multiexp_affine.go | 62 +--- ecc/bls24-317/multiexp_jacobian.go | 64 +--- ecc/bls24-317/multiexp_test.go | 4 +- ecc/bn254/g1.go | 2 +- ecc/bn254/g2.go | 2 +- ecc/bn254/multiexp.go | 343 ++++++++++++------ ecc/bn254/multiexp_affine.go | 62 +--- ecc/bn254/multiexp_jacobian.go | 64 +--- ecc/bn254/multiexp_test.go | 4 +- ecc/bw6-633/g1.go | 2 +- ecc/bw6-633/g2.go | 2 +- ecc/bw6-633/multiexp.go | 299 ++++++++++----- ecc/bw6-633/multiexp_affine.go | 62 +--- ecc/bw6-633/multiexp_jacobian.go | 64 +--- ecc/bw6-633/multiexp_test.go | 4 +- ecc/bw6-756/g1.go | 2 +- ecc/bw6-756/g2.go | 2 +- ecc/bw6-756/multiexp.go | 299 ++++++++++----- ecc/bw6-756/multiexp_affine.go | 62 +--- ecc/bw6-756/multiexp_jacobian.go | 64 +--- ecc/bw6-756/multiexp_test.go | 4 +- ecc/bw6-761/g1.go | 2 +- ecc/bw6-761/g2.go | 2 +- ecc/bw6-761/multiexp.go | 299 ++++++++++----- ecc/bw6-761/multiexp_affine.go | 62 +--- ecc/bw6-761/multiexp_jacobian.go | 64 +--- ecc/bw6-761/multiexp_test.go | 4 +- .../generator/ecc/template/multiexp.go.tmpl | 217 ++++++++--- .../ecc/template/multiexp_affine.go.tmpl | 32 +- .../ecc/template/multiexp_jacobian.go.tmpl | 33 +- internal/generator/ecc/template/point.go.tmpl | 2 +- .../ecc/template/tests/multiexp.go.tmpl | 2 +- 59 files changed, 2421 insertions(+), 2026 deletions(-) diff --git a/ecc/bls12-377/g1.go b/ecc/bls12-377/g1.go index bc9027480a..3b436a6b2b 100644 --- a/ecc/bls12-377/g1.go +++ b/ecc/bls12-377/g1.go @@ -915,7 +915,7 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, 
runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bls12-377/g2.go b/ecc/bls12-377/g2.go index fdf535ca82..18810fe510 100644 --- a/ecc/bls12-377/g2.go +++ b/ecc/bls12-377/g2.go @@ -914,7 +914,7 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index 1673861355..4fad52e512 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -106,120 +106,113 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. + // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
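// the TODO above plans to restore the nbSplits > 1 path by splitting the point
// set before partitioning the scalars; a rough sketch of that shape, assuming
// the current innerMsmG1 signature (variable names are illustrative):
//
//	half := nbPoints / 2
//	dLo, _ := partitionScalars(scalars[:half], C, config.ScalarsMont, config.NbTasks)
//	dHi, _ := partitionScalars(scalars[half:], C, config.ScalarsMont, config.NbTasks)
//	var pLo, pHi G1Jac
//	innerMsmG1(&pLo, int(C), points[:half], dLo, splitFirstChunk)
//	innerMsmG1(&pHi, int(C), points[half:], dHi, splitFirstChunk)
//	p.Set(&pLo)
//	p.AddAssign(&pHi)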
- _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G1Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG1BatchAffine[bucketG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - _innerMsmG1(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG1BatchAffine[bucketG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 11, points, scalars, splitFirstChunk, processChunk, 
processLastChunk) + _innerMsmG1(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG1BatchAffine[bucketG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG1BatchAffine[bucketG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - _innerMsmG1(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG1BatchAffine[bucketG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG1BatchAffine[bucketG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG1BatchAffine[bucketG1AffineC20] processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG1BatchAffine[bucketG1AffineC21] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -238,10 +231,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. 
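// note: partitionScalars lays the digits out chunk-major: the digit of scalar i
// for chunk j lives at pscalars[j*n+i], with n = len(points); the slices handed
// to the goroutines below therefore each cover exactly one chunk:
//
//	digits := pscalars[j*n : (j+1)*n] // all n digits of chunk j
//	bits := digits[i]                 // digit of scalar i in that chunk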
- go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) + n := len(points) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, scalars) + go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -249,12 +243,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, scalars) + go processChunk(0, chChunks[0], c, points, pscalars[:n]) } else { chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, chSplit, c, points[:split], scalars[:split]) - go processChunk(0, chSplit, c, points[split:], scalars[split:]) + split := n / 2 + go processChunk(0, chSplit, c, points[:split], pscalars[:split]) + go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -366,120 +360,113 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. + // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G2Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG2BatchAffine[bucketG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - _innerMsmG2(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG2BatchAffine[bucketG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 11, points, scalars, splitFirstChunk, processChunk, 
processLastChunk) + _innerMsmG2(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG2BatchAffine[bucketG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG2BatchAffine[bucketG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - _innerMsmG2(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG2BatchAffine[bucketG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG2BatchAffine[bucketG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG2BatchAffine[bucketG2AffineC20] processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG2BatchAffine[bucketG2AffineC21] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -498,10 +485,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. 
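// once every chunk goroutine has sent its bucket sum, the per-chunk results are
// recombined as result = sum_j 2^{c*j} * chunk_j; a minimal sketch of that
// reduction (done by msmReduceChunkG2Affine), assuming the ext-jacobian
// double/add helpers used elsewhere in this patch:
//
//	var acc g2JacExtended
//	acc.setInfinity()
//	for j := len(chChunks) - 1; j >= 0; j-- {
//		if j != len(chChunks)-1 {
//			// c doublings multiply the accumulator by 2^c (one window)
//			for l := uint64(0); l < c; l++ {
//				acc.double(&acc)
//			}
//		}
//		total := <-chChunks[j]
//		acc.add(&total)
//	}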
-	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars)
+	n := len(points)
+	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:])

 	for j := int(nbChunks - 2); j > 0; j-- {
-		go processChunk(uint64(j), chChunks[j], c, points, scalars)
+		go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n])
 	}

 	// the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1]
@@ -509,12 +497,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp
 	// in the ~same amount of time
 	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
 		if !splitFirstChunk {
-			go processChunk(0, chChunks[0], c, points, scalars)
+			go processChunk(0, chChunks[0], c, points, pscalars[:n])
 		} else {
 			chSplit := make(chan g2JacExtended, 2)
-			split := len(points) / 2
-			go processChunk(0, chSplit, c, points[:split], scalars[:split])
-			go processChunk(0, chSplit, c, points[split:], scalars[split:])
+			split := n / 2
+			go processChunk(0, chSplit, c, points[:split], pscalars[:split])
+			go processChunk(0, chSplit, c, points[split:], pscalars[split:n])
 			go func() {
 				s1 := <-chSplit
 				s2 := <-chSplit
@@ -565,7 +553,132 @@ type selector struct {
 // scalarsMont indicates wheter the provided scalars are in montgomery form
 // returns smallValues, which represent the number of scalars which meets the following condition
 // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero)
-func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) {
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
+
+	toReturn := make([]uint32, len(scalars)*int(nbChunks))
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	// msbWindow := uint64(1 << (c -1)) // msb of the c-bit window
+	max := int(1 << (c - 1))    // max value we want for our digits
+	cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words
+
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
+		}
+		selectors[chunk] = d
+	}
+
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split the first chunk
+	// processing in the msm in 2, to ensure all go routines finish at ~the same time
+	// /!\ a buffer of nbTasks is enough, as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it did, though, this would deadlock.
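// (the nbTasks buffer on the channel below is what makes this safe: every worker
// can complete its send before the aggregation loop starts receiving; an
// unbuffered channel would block the senders and parallel.Execute would never
// return)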
+	chSmallValues := make(chan int, nbTasks)
+
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int
+
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.FitsOnOneWord() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c-bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}
+
+			// for each chunk in the scalar, compute the current digit, and an eventual carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]
+
+				// init with carry if any
+				digit := carry
+				carry = 0
+
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)
+
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}
+
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}
+
+				// if the digit is larger than 2^{c-1}, we borrow 2^c from the next window and subtract
+				// 2^{c} from the current digit, making it negative.
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
+
+				var bits uint32
+				if digit >= 0 {
+					bits = uint32(digit) << 1
+				} else {
+					bits = (uint32(-digit-1) << 1) + 1
+				}
+				toReturn[int(chunk)*len(scalars)+i] = bits
+				// toReturn[i][s.index] |= (bits << s.shift)
+				// if s.multiWordSelect {
+				// 	toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				// }
+
+			}
+		}

+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
+}
+
+// partitionScalarsOld computes, for each scalar, nbChunks digits over c-bit wide windows
+// if the digit is larger than 2^{c-1}, we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
+// scalarsMont indicates whether the provided scalars are in montgomery form
+// returns smallValues, which represents the number of scalars that meet the following condition
+// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero)
+func partitionScalarsOld(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
 	toReturn := make([]fr.Element, len(scalars))

 	// number of c-bit radixes in a scalar
diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go
index ac8d41cbb6..c3f89f7406 100644
--- a/ecc/bls12-377/multiexp_affine.go
+++ b/ecc/bls12-377/multiexp_affine.go
@@ -16,10 +16,6 @@

 package bls12377

-import (
-	"github.com/consensys/gnark-crypto/ecc/bls12-377/fr"
-)
-
 const MAX_BATCH_SIZE = 600

 type batchOp struct {
@@ -40,35 +36,18 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64,
 	chRes chan<- g1JacExtended,
 	c uint64,
 	points []G1Affine,
-	scalars []fr.Element) {
+	pscalars []uint32) {

-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
 	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}

-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
-	}
-
 	batch := newBatchG1Affine(&buckets, points)
 	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
 	nbBatches := 0
-	for i := 0; i < len(scalars); i++ {
-		bits := (scalars[i][s.index] & s.mask) >> s.shift
-		if s.multiWordSelect {
-			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
-		}
+	for i := 0; i < len(pscalars); i++ {
+		bits := pscalars[i]

 		if bits == 0 {
 			continue
@@ -76,13 +55,13 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64,

 		op := batchOp{pointID: uint32(i) << 1}
 		// if msbWindow bit is set, we need to substract
-		if bits&msbWindow == 0 {
+		if bits&1 == 0 {
 			// add
-			op.bucketID = uint32(bits - 1)
+			op.bucketID = uint32((bits >> 1) - 1)
 			// buckets[bits-1].Add(&points[i], &buckets[bits-1])
 		} else {
 			// sub
-			op.bucketID = (uint32(bits & ^msbWindow))
+			op.bucketID = (uint32((bits >> 1)))
 			op.pointID += 1
 			// op.isNeg = true
 			// buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i])
@@ -283,35 +262,18 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64,
 	chRes chan<- g2JacExtended,
 	c uint64,
 	points []G2Affine,
-	scalars []fr.Element) {
+	pscalars []uint32) {

-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
 	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}

-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
-	}
-
 	batch := newBatchG2Affine(&buckets, points)
 	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
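// sketch of the batchOp encoding used in the loop below: the digit's low bit
// selects add vs. sub, and the point's sign travels in the low bit of pointID
// (both IDs fit in a uint32):
//
//	op := batchOp{pointID: uint32(i) << 1} // point i, positive
//	if bits&1 == 0 {
//		op.bucketID = (bits >> 1) - 1 // even digit 2d: add to bucket d-1
//	} else {
//		op.bucketID = bits >> 1 // odd digit 2d+1: subtract, bucket d
//		op.pointID |= 1         // mark the point as negated
//	}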
nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue @@ -319,13 +281,13 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - op.bucketID = uint32(bits - 1) + op.bucketID = uint32((bits >> 1) - 1) // buckets[bits-1].Add(&points[i], &buckets[bits-1]) } else { // sub - op.bucketID = (uint32(bits & ^msbWindow)) + op.bucketID = (uint32((bits >> 1))) op.pointID += 1 // op.isNeg = true // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) diff --git a/ecc/bls12-377/multiexp_jacobian.go b/ecc/bls12-377/multiexp_jacobian.go index dc787cb52e..1981509ee0 100644 --- a/ecc/bls12-377/multiexp_jacobian.go +++ b/ecc/bls12-377/multiexp_jacobian.go @@ -16,54 +16,32 @@ package bls12377 -import ( - "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" -) - func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } @@ -127,46 +105,28 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. 
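// decoding convention for the unpacked digits (cf. partitionScalars): 0 means
// "skip this point", an even value 2d encodes the digit +d, and an odd value
// 2d+1 encodes the digit -(d+1); hence bucket (bits>>1)-1 on the add path and
// bucket bits>>1 on the sub path below.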
- for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 7882874fda..fe84ecc91f 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) diff --git a/ecc/bls12-378/g1.go b/ecc/bls12-378/g1.go index fd8fbe7ee0..5b9ec0f84f 100644 --- a/ecc/bls12-378/g1.go +++ b/ecc/bls12-378/g1.go @@ -915,7 +915,7 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bls12-378/g2.go b/ecc/bls12-378/g2.go index 479cda7053..0010b3983b 100644 --- a/ecc/bls12-378/g2.go +++ b/ecc/bls12-378/g2.go @@ -914,7 +914,7 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 862cca829b..200a5fc096 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -106,120 +106,113 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. 
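// worked example for the chunk count computed below: fr.Limbs*64 = 256 bits, so
// C=16 yields exactly 16 chunks, while C=21 yields ceil(256/21) = 13 chunks
// whose last window is only 256 - 12*21 = 4 bits wide; that is why innerMsmG1's
// c=21 case pairs processChunkG1BatchAffine with the tiny bucketg1JacExtendedC4
// for the last chunk.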
+ // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. - _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G1Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := 
processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG1BatchAffine[bucketG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - _innerMsmG1(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG1BatchAffine[bucketG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG1BatchAffine[bucketG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG1BatchAffine[bucketG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - _innerMsmG1(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG1BatchAffine[bucketG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG1BatchAffine[bucketG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG1BatchAffine[bucketG1AffineC20] processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG1BatchAffine[bucketG1AffineC21] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { +func _innerMsmG1(p *G1Jac, 
c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -238,10 +231,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) + n := len(points) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, scalars) + go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -249,12 +243,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, scalars) + go processChunk(0, chChunks[0], c, points, pscalars[:n]) } else { chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, chSplit, c, points[:split], scalars[:split]) - go processChunk(0, chSplit, c, points[split:], scalars[split:]) + split := n / 2 + go processChunk(0, chSplit, c, points[:split], pscalars[:split]) + go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -366,120 +360,113 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. + // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
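With the split removed, partitionScalars returns a flat []uint32 laid out chunk-major: the digit of scalar i for chunk j lives at index j*len(scalars)+i. That is what makes the per-chunk slicing in _innerMsmG1 above and _innerMsmG2 below a plain sub-slice. A small sketch of the indexing (digitsForChunk is illustrative, not part of the patch):

// digitsForChunk returns the n digits chunk j must process, given the
// chunk-major layout produced by the new partitionScalars; the digit of
// scalar i within that chunk is the slice's i-th element.
func digitsForChunk(pscalars []uint32, j, n int) []uint32 {
	return pscalars[j*n : (j+1)*n]
}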
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G2Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG2BatchAffine[bucketG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - _innerMsmG2(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG2BatchAffine[bucketG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 11, points, scalars, splitFirstChunk, processChunk, 
processLastChunk) + _innerMsmG2(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG2BatchAffine[bucketG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG2BatchAffine[bucketG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - _innerMsmG2(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG2BatchAffine[bucketG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG2BatchAffine[bucketG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG2BatchAffine[bucketG2AffineC20] processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG2BatchAffine[bucketG2AffineC21] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -498,10 +485,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. 
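The bucket types picked for processLastChunk in the switch above follow from the width of the top window: with 256-bit scalars and c = 13 the last window spans 256 - 19*13 = 9 bits, hence the C9 bucket type; with c = 15 it spans a single bit, hence C1. A sketch of that computation (lastChunkWidth is illustrative, not part of the patch):

// lastChunkWidth returns the bit-width of the most-significant window,
// with scalarBits = fr.Limbs*64 in the code above.
func lastChunkWidth(scalarBits, c uint64) uint64 {
	nbChunks := (scalarBits + c - 1) / c // ceiling division, as in _innerMsmG2
	return scalarBits - (nbChunks-1)*c
}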
- go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) + n := len(points) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, scalars) + go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -509,12 +497,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, scalars) + go processChunk(0, chChunks[0], c, points, pscalars[:n]) } else { chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, chSplit, c, points[:split], scalars[:split]) - go processChunk(0, chSplit, c, points[split:], scalars[split:]) + split := n / 2 + go processChunk(0, chSplit, c, points[:split], pscalars[:split]) + go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -565,7 +553,132 @@ type selector struct { // scalarsMont indicates wheter the provided scalars are in montgomery form // returns smallValues, which represent the number of scalars which meets the following condition // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) { + // number of c-bit radixes in a scalar + nbChunks := fr.Limbs * 64 / c + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } + + toReturn := make([]uint32, len(scalars)*int(nbChunks)) + + mask := uint64((1 << c) - 1) // low c bits are 1 + // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window + max := int(1 << (c - 1)) // max value we want for our digits + cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words + + // compute offset and word selector / shift to select the right bits of our windows + selectors := make([]selector, nbChunks) + for chunk := uint64(0); chunk < nbChunks; chunk++ { + jc := uint64(chunk * c) + d := selector{} + d.index = jc / 64 + d.shift = jc - (d.index * 64) + d.mask = mask << d.shift + d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1) + if d.multiWordSelect { + nbBitsHigh := d.shift - uint64(64-c) + d.maskHigh = (1 << nbBitsHigh) - 1 + d.shiftHigh = (c - nbBitsHigh) + } + selectors[chunk] = d + } + + // for each chunk, we could track the number of non-zero points we will need to process + // this way, if a chunk has more work to do than others, we can spawn off more go routines + // (at the cost of more buckets allocated) + // a simplified approach is to track the small values where only the first word is set + // if this number represents a significant number of points, then we will split the first chunk + // processing in the msm in 2, to ensure all go routines finish at ~same time + // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines + // if it does, though, this will deadlock.
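Concretely, for c = 4 a raw window value lives in [0, 16), but the loop below keeps digits in [-8, 8): a window value of 9 becomes 9 - 16 = -7 and pushes a carry of 1 into the next window. A self-contained sketch of that recoding over plain ints (recodeWindows is illustrative, not the library code):

// recodeWindows turns little-endian base-2^c windows into signed digits
// in [-2^(c-1), 2^(c-1)), propagating the borrow the same way the
// parallel.Execute loop below does; the final carry spills into one
// extra digit.
func recodeWindows(windows []int, c uint) []int {
	digits := make([]int, len(windows)+1)
	carry := 0
	for i, w := range windows {
		d := w + carry
		carry = 0
		if d >= 1<<(c-1) {
			d -= 1 << c
			carry = 1
		}
		digits[i] = d
	}
	digits[len(windows)] = carry
	return digits
}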
+ chSmallValues := make(chan int, nbTasks) + + parallel.Execute(len(scalars), func(start, end int) { + smallValues := 0 + for i := start; i < end; i++ { + var carry int + + scalar := scalars[i] + if scalarsMont { + scalar.FromMont() + } + if scalar.FitsOnOneWord() { + // everything is 0, no need to process this scalar + if scalar[0] == 0 { + continue + } + // low c-bits are 1 in mask + if scalar[0]&mask == scalar[0] { + smallValues++ + } + } + + // for each chunk in the scalar, compute the current digit, and an eventual carry + for chunk := uint64(0); chunk < nbChunks; chunk++ { + s := selectors[chunk] + + // init with carry if any + digit := carry + carry = 0 + + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) + + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } + + // if digit is zero, no impact on result + if digit == 0 { + continue + } + + // if the digit is larger than or equal to 2^{c-1}, we borrow 2^c from the next window and subtract + // 2^{c} from the current digit, making it negative. + if digit >= max { + digit -= (1 << c) + carry = 1 + } + + var bits uint32 + if digit >= 0 { + bits = uint32(digit) << 1 + } else { + bits = (uint32(-digit-1) << 1) + 1 + } + toReturn[int(chunk)*len(scalars)+i] = bits + // [s.index] |= (bits << s.shift) + // if s.multiWordSelect { + // toReturn[i][s.index+1] |= (bits >> s.shiftHigh) + // } + + } + } + + chSmallValues <- smallValues + + }, nbTasks) + + // aggregate small values + close(chSmallValues) + smallValues := 0 + for o := range chSmallValues { + smallValues += o + } + return toReturn, smallValues +} + +// partitionScalarsOld computes, for each scalar over c-bit wide windows, nbChunks digits +// if the digit is larger than or equal to 2^{c-1}, we borrow 2^c from the next window and subtract +// 2^{c} from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G +// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) +// scalarsMont indicates whether the provided scalars are in Montgomery form +// returns smallValues, which represents the number of scalars that meet the following condition +// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) +func partitionScalarsOld(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { toReturn := make([]fr.Element, len(scalars)) // number of c-bit radixes in a scalar diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index 583761fe76..06c5f74bfe 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -16,10 +16,6 @@ package bls12378 -import ( - "github.com/consensys/gnark-crypto/ecc/bls12-378/fr" -) - const MAX_BATCH_SIZE = 600 type batchOp struct { @@ -40,35 +36,18 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - scalars []fr.Element) { + pscalars []uint32) { - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - batch := newBatchG1Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue @@ -76,13 +55,13 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - op.bucketID = uint32(bits - 1) + op.bucketID = uint32((bits >> 1) - 1) // buckets[bits-1].Add(&points[i], &buckets[bits-1]) } else { // sub - op.bucketID = (uint32(bits & ^msbWindow)) + op.bucketID = (uint32((bits >> 1))) op.pointID += 1 // op.isNeg = true // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) @@ -283,35 +262,18 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - scalars []fr.Element) { + pscalars []uint32) { - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - batch := newBatchG2Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
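The batchOp construction above uses the same low-bit trick for points as for digits: pointID holds the point index shifted left by one, and the sub branch sets the low bit via op.pointID += 1. A sketch of the round-trip (packOp/unpackOp are hypothetical helpers, not in the patch):

// packOp and unpackOp mirror the pointID packing in the
// processChunk*BatchAffine loops: index<<1, low bit set for a subtraction.
func packOp(pointIndex uint32, neg bool) uint32 {
	id := pointIndex << 1
	if neg {
		id |= 1
	}
	return id
}

func unpackOp(pointID uint32) (pointIndex uint32, neg bool) {
	return pointID >> 1, pointID&1 == 1
}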
nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue @@ -319,13 +281,13 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - op.bucketID = uint32(bits - 1) + op.bucketID = uint32((bits >> 1) - 1) // buckets[bits-1].Add(&points[i], &buckets[bits-1]) } else { // sub - op.bucketID = (uint32(bits & ^msbWindow)) + op.bucketID = (uint32((bits >> 1))) op.pointID += 1 // op.isNeg = true // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) diff --git a/ecc/bls12-378/multiexp_jacobian.go b/ecc/bls12-378/multiexp_jacobian.go index 33efbc4286..592070f11d 100644 --- a/ecc/bls12-378/multiexp_jacobian.go +++ b/ecc/bls12-378/multiexp_jacobian.go @@ -16,54 +16,32 @@ package bls12378 -import ( - "github.com/consensys/gnark-crypto/ecc/bls12-378/fr" -) - func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } @@ -127,46 +105,28 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. 
- for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index 8a80c9d1f8..a94fc2e0b9 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) diff --git a/ecc/bls12-381/g1.go b/ecc/bls12-381/g1.go index 189c5ac202..eccf0c9c97 100644 --- a/ecc/bls12-381/g1.go +++ b/ecc/bls12-381/g1.go @@ -915,7 +915,7 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bls12-381/g2.go b/ecc/bls12-381/g2.go index 3473f3d002..5264766d99 100644 --- a/ecc/bls12-381/g2.go +++ b/ecc/bls12-381/g2.go @@ -915,7 +915,7 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index d926dc8e2e..50b716e3c4 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -106,120 +106,113 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. 
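With the splitting loop gone, C is chosen once from the total point count and the chunk count is a ceiling division: for 256-bit scalars, C = 16 gives 16 chunks, while C = 20 gives ceil(256/20) = 13. A one-line sketch (nbChunksFor is illustrative, not part of the patch):

// nbChunksFor returns ceil(scalarBits/c), the number of c-bit windows,
// mirroring the nbChunks computation just below (scalarBits = fr.Limbs*64).
func nbChunksFor(scalarBits, c uint64) uint64 {
	return (scalarBits + c - 1) / c
}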
+ // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. - _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G1Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := 
processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG1BatchAffine[bucketG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - _innerMsmG1(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG1BatchAffine[bucketG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG1BatchAffine[bucketG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG1BatchAffine[bucketG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - _innerMsmG1(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG1BatchAffine[bucketG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG1BatchAffine[bucketG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG1BatchAffine[bucketG1AffineC20] processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG1BatchAffine[bucketG1AffineC21] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { +func _innerMsmG1(p *G1Jac, 
c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -238,10 +231,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) + n := len(points) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, scalars) + go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -249,12 +243,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, scalars) + go processChunk(0, chChunks[0], c, points, pscalars[:n]) } else { chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, chSplit, c, points[:split], scalars[:split]) - go processChunk(0, chSplit, c, points[split:], scalars[split:]) + split := n / 2 + go processChunk(0, chSplit, c, points[:split], pscalars[:split]) + go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -366,120 +360,113 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. + // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G2Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG2BatchAffine[bucketG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - _innerMsmG2(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG2BatchAffine[bucketG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 11, points, scalars, splitFirstChunk, processChunk, 
processLastChunk) + _innerMsmG2(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG2BatchAffine[bucketG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG2BatchAffine[bucketG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - _innerMsmG2(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG2BatchAffine[bucketG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG2BatchAffine[bucketG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG2BatchAffine[bucketG2AffineC20] processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG2BatchAffine[bucketG2AffineC21] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -498,10 +485,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. 
- go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) + n := len(points) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, scalars) + go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -509,12 +497,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, scalars) + go processChunk(0, chChunks[0], c, points, pscalars[:n]) } else { chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, chSplit, c, points[:split], scalars[:split]) - go processChunk(0, chSplit, c, points[split:], scalars[split:]) + split := n / 2 + go processChunk(0, chSplit, c, points[:split], pscalars[:split]) + go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -565,7 +553,132 @@ type selector struct { // scalarsMont indicates wheter the provided scalars are in montgomery form // returns smallValues, which represent the number of scalars which meets the following condition // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) { + // number of c-bit radixes in a scalar + nbChunks := fr.Limbs * 64 / c + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } + + toReturn := make([]uint32, len(scalars)*int(nbChunks)) + + mask := uint64((1 << c) - 1) // low c bits are 1 + // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window + max := int(1 << (c - 1)) // max value we want for our digits + cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words + + // compute offset and word selector / shift to select the right bits of our windows + selectors := make([]selector, nbChunks) + for chunk := uint64(0); chunk < nbChunks; chunk++ { + jc := uint64(chunk * c) + d := selector{} + d.index = jc / 64 + d.shift = jc - (d.index * 64) + d.mask = mask << d.shift + d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1) + if d.multiWordSelect { + nbBitsHigh := d.shift - uint64(64-c) + d.maskHigh = (1 << nbBitsHigh) - 1 + d.shiftHigh = (c - nbBitsHigh) + } + selectors[chunk] = d + } + + // for each chunk, we could track the number of non-zero points we will need to process + // this way, if a chunk has more work to do than others, we can spawn off more go routines + // (at the cost of more buckets allocated) + // a simplified approach is to track the small values where only the first word is set + // if this number represents a significant number of points, then we will split the first chunk + // processing in the msm in 2, to ensure all go routines finish at ~same time + // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines + // if it does, though, this will deadlock.
+ chSmallValues := make(chan int, nbTasks) + + parallel.Execute(len(scalars), func(start, end int) { + smallValues := 0 + for i := start; i < end; i++ { + var carry int + + scalar := scalars[i] + if scalarsMont { + scalar.FromMont() + } + if scalar.FitsOnOneWord() { + // everything is 0, no need to process this scalar + if scalar[0] == 0 { + continue + } + // low c-bits are 1 in mask + if scalar[0]&mask == scalar[0] { + smallValues++ + } + } + + // for each chunk in the scalar, compute the current digit, and an eventual carry + for chunk := uint64(0); chunk < nbChunks; chunk++ { + s := selectors[chunk] + + // init with carry if any + digit := carry + carry = 0 + + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) + + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } + + // if digit is zero, no impact on result + if digit == 0 { + continue + } + + // if the digit is larger than or equal to 2^{c-1}, we borrow 2^c from the next window and subtract + // 2^{c} from the current digit, making it negative. + if digit >= max { + digit -= (1 << c) + carry = 1 + } + + var bits uint32 + if digit >= 0 { + bits = uint32(digit) << 1 + } else { + bits = (uint32(-digit-1) << 1) + 1 + } + toReturn[int(chunk)*len(scalars)+i] = bits + // [s.index] |= (bits << s.shift) + // if s.multiWordSelect { + // toReturn[i][s.index+1] |= (bits >> s.shiftHigh) + // } + + } + } + + chSmallValues <- smallValues + + }, nbTasks) + + // aggregate small values + close(chSmallValues) + smallValues := 0 + for o := range chSmallValues { + smallValues += o + } + return toReturn, smallValues +} + +// partitionScalarsOld computes, for each scalar over c-bit wide windows, nbChunks digits +// if the digit is larger than or equal to 2^{c-1}, we borrow 2^c from the next window and subtract +// 2^{c} from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G +// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) +// scalarsMont indicates whether the provided scalars are in Montgomery form +// returns smallValues, which represents the number of scalars that meet the following condition +// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) +func partitionScalarsOld(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { toReturn := make([]fr.Element, len(scalars)) // number of c-bit radixes in a scalar diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index 36695009a0..3f0eeabba3 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -16,10 +16,6 @@ package bls12381 -import ( - "github.com/consensys/gnark-crypto/ecc/bls12-381/fr" -) - const MAX_BATCH_SIZE = 600 type batchOp struct { @@ -40,35 +36,18 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - scalars []fr.Element) { + pscalars []uint32) { - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - batch := newBatchG1Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue @@ -76,13 +55,13 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - op.bucketID = uint32(bits - 1) + op.bucketID = uint32((bits >> 1) - 1) // buckets[bits-1].Add(&points[i], &buckets[bits-1]) } else { // sub - op.bucketID = (uint32(bits & ^msbWindow)) + op.bucketID = (uint32((bits >> 1))) op.pointID += 1 // op.isNeg = true // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) @@ -283,35 +262,18 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - scalars []fr.Element) { + pscalars []uint32) { - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - batch := newBatchG2Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue @@ -319,13 +281,13 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - op.bucketID = uint32(bits - 1) + op.bucketID = uint32((bits >> 1) - 1) // buckets[bits-1].Add(&points[i], &buckets[bits-1]) } else { // sub - op.bucketID = (uint32(bits & ^msbWindow)) + op.bucketID = (uint32((bits >> 1))) op.pointID += 1 // op.isNeg = true // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) diff --git a/ecc/bls12-381/multiexp_jacobian.go b/ecc/bls12-381/multiexp_jacobian.go index b62f2d6012..3840228907 100644 --- a/ecc/bls12-381/multiexp_jacobian.go +++ b/ecc/bls12-381/multiexp_jacobian.go @@ -16,54 +16,32 @@ package bls12381 -import ( - "github.com/consensys/gnark-crypto/ecc/bls12-381/fr" -) - func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } @@ -127,46 +105,28 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. 
- for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index 15cd0f5304..8e356fde9d 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) diff --git a/ecc/bls24-315/g1.go b/ecc/bls24-315/g1.go index 3209c210ad..173d24e902 100644 --- a/ecc/bls24-315/g1.go +++ b/ecc/bls24-315/g1.go @@ -917,7 +917,7 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bls24-315/g2.go b/ecc/bls24-315/g2.go index 7f377b8147..e498978c0b 100644 --- a/ecc/bls24-315/g2.go +++ b/ecc/bls24-315/g2.go @@ -930,7 +930,7 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 3686207518..912d69ba44 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -106,120 +106,113 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. 
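The deleted loop above auto-split the instance until enough chunks existed to feed config.NbTasks; with the new chunk-major digit layout the split has to happen before partitionScalars, hence the TODO. For reference, a standalone model of the retired heuristic (reusing the package's bestC; scalarBits stands in for fr.Limbs*64):

// splitHeuristic models the removed loop: double the number of splits
// (halving the points per split) until nbChunks*nbSplits covers nbTasks.
func splitHeuristic(nbPoints, nbTasks, scalarBits int) (c uint64, nbSplits int) {
	nbSplits = 1
	nbChunks := 0
	for nbChunks < nbTasks {
		c = bestC(nbPoints)
		nbChunks = scalarBits / int(c)
		if scalarBits%int(c) != 0 {
			nbChunks++
		}
		nbChunks *= nbSplits
		if nbChunks < nbTasks {
			nbSplits <<= 1
			nbPoints >>= 1
		}
	}
	return c, nbSplits
}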
+ // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. - _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G1Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := 
processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG1BatchAffine[bucketG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - _innerMsmG1(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG1BatchAffine[bucketG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG1BatchAffine[bucketG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG1BatchAffine[bucketG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - _innerMsmG1(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG1BatchAffine[bucketG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG1BatchAffine[bucketG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG1BatchAffine[bucketG1AffineC20] processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG1BatchAffine[bucketG1AffineC21] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { +func _innerMsmG1(p *G1Jac, 
c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -238,10 +231,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) + n := len(points) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, scalars) + go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -249,12 +243,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, scalars) + go processChunk(0, chChunks[0], c, points, pscalars[:n]) } else { chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, chSplit, c, points[:split], scalars[:split]) - go processChunk(0, chSplit, c, points[split:], scalars[split:]) + split := n / 2 + go processChunk(0, chSplit, c, points[:split], pscalars[:split]) + go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -366,120 +360,113 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. + // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
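Because partitionScalars now returns digits in chunk-major order, the slicing above hands each goroutine one contiguous window: with n = len(points), the digit of scalar i for window j sits at pscalars[j*n+i]. A small sketch making the indexing explicit (hypothetical helpers):

// chunkDigits is the view processChunk receives for window j;
// len(pscalars) == nbChunks*n, so each window is one contiguous n-slice.
func chunkDigits(pscalars []uint32, n, j int) []uint32 {
	return pscalars[j*n : (j+1)*n]
}

// digitAt returns the encoded digit of scalar i in window j.
func digitAt(pscalars []uint32, n, j, i int) uint32 {
	return pscalars[j*n+i]
}

This layout is also why the first-chunk split passes pscalars[:split] and pscalars[split:n]: both halves index into window 0 only.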
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G2Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG2BatchAffine[bucketG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - _innerMsmG2(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG2BatchAffine[bucketG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 11, points, scalars, splitFirstChunk, processChunk, 
processLastChunk) + _innerMsmG2(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG2BatchAffine[bucketG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG2BatchAffine[bucketG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - _innerMsmG2(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG2BatchAffine[bucketG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG2BatchAffine[bucketG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG2BatchAffine[bucketG2AffineC20] processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG2BatchAffine[bucketG2AffineC21] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -498,10 +485,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. 
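Concretely, the top window holds fr.Limbs*64 - (nbChunks-1)*c bits, which is why the switch above pairs, e.g., c = 5 with a 1-bit last chunk and c = 20 with a 16-bit one. A sketch of the arithmetic (assuming a 4-limb scalar field, i.e. 256 bits of limbs):

// lastWindowBits computes how many bits remain for the top window.
func lastWindowBits(c int) int {
	const totalBits = 4 * 64 // fr.Limbs * 64
	nbChunks := (totalBits + c - 1) / c
	return totalBits - (nbChunks-1)*c
}

// lastWindowBits(5) == 1   -> bucketg2JacExtendedC1 above
// lastWindowBits(20) == 16 -> bucketG2AffineC16 above
// lastWindowBits(16) == 16 -> c divides 256, same processor for all chunks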
- go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) + n := len(points) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, scalars) + go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -509,12 +497,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, scalars) + go processChunk(0, chChunks[0], c, points, pscalars[:n]) } else { chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, chSplit, c, points[:split], scalars[:split]) - go processChunk(0, chSplit, c, points[split:], scalars[split:]) + split := n / 2 + go processChunk(0, chSplit, c, points[:split], pscalars[:split]) + go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -565,7 +553,132 @@ type selector struct { // scalarsMont indicates wheter the provided scalars are in montgomery form // returns smallValues, which represent the number of scalars which meets the following condition // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) { + // number of c-bit radixes in a scalar + nbChunks := fr.Limbs * 64 / c + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } + + toReturn := make([]uint32, len(scalars)*int(nbChunks)) + + mask := uint64((1 << c) - 1) // low c bits are 1 + // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window + max := int(1 << (c - 1)) // max value we want for our digits + cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words + + // compute offset and word selector / shift to select the right bits of our windows + selectors := make([]selector, nbChunks) + for chunk := uint64(0); chunk < nbChunks; chunk++ { + jc := uint64(chunk * c) + d := selector{} + d.index = jc / 64 + d.shift = jc - (d.index * 64) + d.mask = mask << d.shift + d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1) + if d.multiWordSelect { + nbBitsHigh := d.shift - uint64(64-c) + d.maskHigh = (1 << nbBitsHigh) - 1 + d.shiftHigh = (c - nbBitsHigh) + } + selectors[chunk] = d + } + + // for each chunk, we could track the number of non-zeros points we will need to process + // this way, if a chunk has more work to do than others, we can spawn off more go routines + // (at the cost of more buckets allocated) + // a simplified approach is to track the small values where only the first word is set + // if this number represent a significant number of points, then we will split first chunk + // processing in the msm in 2, to ensure all go routines finish at ~same time + // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine + // if it does, though, this will deadlocK. 
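In the parallel loop below, a window value v >= 2^{c-1} is rewritten as the negative digit v - 2^c plus a carry of 1 into the next window, so digit magnitudes never exceed 2^{c-1} and half the buckets suffice. A standalone model of one window step:

// borrow models the digit adjustment below: given the raw window value
// (incoming carry already added), return the signed digit and the
// outgoing carry for the next window.
func borrow(v, c int) (digit, carry int) {
	if v >= 1<<(c-1) {
		return v - (1 << c), 1
	}
	return v, 0
}

// e.g. c = 4: borrow(13, 4) == (-3, 1), since 13 = 16 - 3;
// -3 is then encoded as (3-1)<<1 | 1 == 5 and the next window gains +1.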
+	chSmallValues := make(chan int, nbTasks)
+
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int
+
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.FitsOnOneWord() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c-bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}
+
+			// for each chunk in the scalar, compute the current digit, and a possible carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]
+
+				// init with carry if any
+				digit := carry
+				carry = 0
+
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)
+
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}
+
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}
+
+				// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+				// 2^{c} from the current digit, making it negative.
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
+
+				var bits uint32
+				if digit >= 0 {
+					bits = uint32(digit) << 1
+				} else {
+					bits = (uint32(-digit-1) << 1) + 1
+				}
+				toReturn[int(chunk)*len(scalars)+i] = bits
+				// toReturn[i][s.index] |= (bits << s.shift)
+				// if s.multiWordSelect {
+				//	toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				// }
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
+}
+
+// partitionScalarsOld computes, for each scalar over c-bit wide windows, nbChunks digits
+// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
+// scalarsMont indicates whether the provided scalars are in Montgomery form
+// returns smallValues, which represents the number of scalars that meet the following condition
+// 0 < scalar < 2^c (in other words, scalars where only the c least significant bits are non zero)
+func partitionScalarsOld(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
 	toReturn := make([]fr.Element, len(scalars))
 
 	// number of c-bit radixes in a scalar
diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go
index f1cdcfe574..93b8a89cdb 100644
--- a/ecc/bls24-315/multiexp_affine.go
+++ b/ecc/bls24-315/multiexp_affine.go
@@ -16,10 +16,6 @@
 
 package bls24315
 
-import (
-	"github.com/consensys/gnark-crypto/ecc/bls24-315/fr"
-)
-
 const MAX_BATCH_SIZE = 600
 
 type batchOp struct {
@@ -40,35 +36,18 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64,
 	chRes chan<- g1JacExtended,
 	c uint64,
 	points []G1Affine,
-	scalars []fr.Element) {
+	pscalars []uint32) {
 
-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
 	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
 
-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
-	}
-
 	batch := newBatchG1Affine(&buckets, points)
 	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
 	nbBatches := 0
-	for i := 0; i < len(scalars); i++ {
-		bits := (scalars[i][s.index] & s.mask) >> s.shift
-		if s.multiWordSelect {
-			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
-		}
+	for i := 0; i < len(pscalars); i++ {
+		bits := pscalars[i]
 
 		if bits == 0 {
 			continue
@@ -76,13 +55,13 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64,
 		op := batchOp{pointID: uint32(i) << 1}
 		// if msbWindow bit is set, we need to substract
-		if bits&msbWindow == 0 {
+		if bits&1 == 0 {
 			// add
-			op.bucketID = uint32(bits - 1)
+			op.bucketID = uint32((bits >> 1) - 1)
 			// buckets[bits-1].Add(&points[i], &buckets[bits-1])
 		} else {
 			// sub
-			op.bucketID = (uint32(bits & ^msbWindow))
+			op.bucketID = (uint32((bits >> 1)))
 			op.pointID += 1
 			// op.isNeg = true
 			// buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i])
@@ -283,35 +262,18 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64,
 	chRes chan<- g2JacExtended,
 	c uint64,
 	points []G2Affine,
-	scalars []fr.Element) {
+	pscalars []uint32) {
 
-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
 	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
 
-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
-	}
-
 	batch := newBatchG2Affine(&buckets, points)
 	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
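As in the G1 processor, pointID packs the point index with a negation flag in the low bit (op.pointID += 1 in the sub branch), so an op records both which point to add and whether it must enter the batch negated; computing -P is just a Y negation. A decoding sketch (hypothetical helper):

// decodeOp recovers the point index and whether the batched addition
// must use the negated point.
func decodeOp(op batchOp) (pointIdx uint32, isNeg bool) {
	return op.pointID >> 1, op.pointID&1 == 1
}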
nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue @@ -319,13 +281,13 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - op.bucketID = uint32(bits - 1) + op.bucketID = uint32((bits >> 1) - 1) // buckets[bits-1].Add(&points[i], &buckets[bits-1]) } else { // sub - op.bucketID = (uint32(bits & ^msbWindow)) + op.bucketID = (uint32((bits >> 1))) op.pointID += 1 // op.isNeg = true // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) diff --git a/ecc/bls24-315/multiexp_jacobian.go b/ecc/bls24-315/multiexp_jacobian.go index f4a2abe4b6..be1b9a8b65 100644 --- a/ecc/bls24-315/multiexp_jacobian.go +++ b/ecc/bls24-315/multiexp_jacobian.go @@ -16,54 +16,32 @@ package bls24315 -import ( - "github.com/consensys/gnark-crypto/ecc/bls24-315/fr" -) - func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } @@ -127,46 +105,28 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. 
- for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index 6a17d03fb4..352e8122b6 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) diff --git a/ecc/bls24-317/g1.go b/ecc/bls24-317/g1.go index a7198ef2ea..9443125d34 100644 --- a/ecc/bls24-317/g1.go +++ b/ecc/bls24-317/g1.go @@ -917,7 +917,7 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bls24-317/g2.go b/ecc/bls24-317/g2.go index 907c1db13b..0e2738e211 100644 --- a/ecc/bls24-317/g2.go +++ b/ecc/bls24-317/g2.go @@ -930,7 +930,7 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index 2cc4feb7fd..f2c3d767a1 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -106,120 +106,113 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. 
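smallValues, computed by partitionScalars just below, counts scalars that fit entirely in the first c-bit window, a common shape for SNARK witnesses full of 0/1 values; when they exceed 10% of the input the first chunk is processed as two half-range goroutines whose partial results are summed. The threshold itself, mirroring the expression below:

// shouldSplitFirstChunk mirrors the 10% heuristic used below.
func shouldSplitFirstChunk(smallValues, nbScalars int) bool {
	return float64(smallValues)/float64(nbScalars) >= 0.1
}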
+ // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. - _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G1Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := 
processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG1BatchAffine[bucketG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - _innerMsmG1(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG1BatchAffine[bucketG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG1BatchAffine[bucketG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG1BatchAffine[bucketG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - _innerMsmG1(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG1BatchAffine[bucketG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG1BatchAffine[bucketG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG1BatchAffine[bucketG1AffineC20] processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG1BatchAffine[bucketG1AffineC21] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { +func _innerMsmG1(p *G1Jac, 
c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -238,10 +231,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) + n := len(points) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, scalars) + go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -249,12 +243,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, scalars) + go processChunk(0, chChunks[0], c, points, pscalars[:n]) } else { chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, chSplit, c, points[:split], scalars[:split]) - go processChunk(0, chSplit, c, points[split:], scalars[split:]) + split := n / 2 + go processChunk(0, chSplit, c, points[:split], pscalars[:split]) + go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -366,120 +360,113 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. + // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
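Each chunk goroutine returns the bucket-reduced partial sum for one window; the final Jacobian result is the sum over j of 2^{c*j} times chunk j, folded from the most significant window down with c doublings per step (the fold itself sits outside these hunks). An integer model of the recombination:

// foldChunks models the window recombination, chunks[0] being the least
// significant window; <<= c stands in for c point doublings in the group.
func foldChunks(chunks []int, c uint) int {
	total := chunks[len(chunks)-1]
	for j := len(chunks) - 2; j >= 0; j-- {
		total <<= c
		total += chunks[j]
	}
	return total
}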
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G2Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG2BatchAffine[bucketG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - _innerMsmG2(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG2BatchAffine[bucketG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 11, points, scalars, splitFirstChunk, processChunk, 
processLastChunk) + _innerMsmG2(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG2BatchAffine[bucketG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG2BatchAffine[bucketG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - _innerMsmG2(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG2BatchAffine[bucketG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG2BatchAffine[bucketG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG2BatchAffine[bucketG2AffineC20] processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG2BatchAffine[bucketG2AffineC21] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -498,10 +485,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. 
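Both the per-window goroutines below and the first-chunk split rely on the same linearity: a window's partial sum over disjoint point subsets can be computed independently and added, since for window j

  sum_{i<n} d_{i,j}*P_i = sum_{i<n/2} d_{i,j}*P_i + sum_{n/2<=i<n} d_{i,j}*P_i,

which is exactly what the s1/s2 merge on chSplit below computes for chunk 0.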
- go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) + n := len(points) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, scalars) + go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -509,12 +497,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, scalars) + go processChunk(0, chChunks[0], c, points, pscalars[:n]) } else { chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, chSplit, c, points[:split], scalars[:split]) - go processChunk(0, chSplit, c, points[split:], scalars[split:]) + split := n / 2 + go processChunk(0, chSplit, c, points[:split], pscalars[:split]) + go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -565,7 +553,132 @@ type selector struct { // scalarsMont indicates wheter the provided scalars are in montgomery form // returns smallValues, which represent the number of scalars which meets the following condition // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) { + // number of c-bit radixes in a scalar + nbChunks := fr.Limbs * 64 / c + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } + + toReturn := make([]uint32, len(scalars)*int(nbChunks)) + + mask := uint64((1 << c) - 1) // low c bits are 1 + // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window + max := int(1 << (c - 1)) // max value we want for our digits + cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words + + // compute offset and word selector / shift to select the right bits of our windows + selectors := make([]selector, nbChunks) + for chunk := uint64(0); chunk < nbChunks; chunk++ { + jc := uint64(chunk * c) + d := selector{} + d.index = jc / 64 + d.shift = jc - (d.index * 64) + d.mask = mask << d.shift + d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1) + if d.multiWordSelect { + nbBitsHigh := d.shift - uint64(64-c) + d.maskHigh = (1 << nbBitsHigh) - 1 + d.shiftHigh = (c - nbBitsHigh) + } + selectors[chunk] = d + } + + // for each chunk, we could track the number of non-zeros points we will need to process + // this way, if a chunk has more work to do than others, we can spawn off more go routines + // (at the cost of more buckets allocated) + // a simplified approach is to track the small values where only the first word is set + // if this number represent a significant number of points, then we will split first chunk + // processing in the msm in 2, to ensure all go routines finish at ~same time + // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine + // if it does, though, this will deadlocK. 
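When c does not divide 64, a window can straddle two limbs, which is what multiWordSelect above handles. Worked example for c = 5 on a 4-limb scalar, chunk 12: jc = 60, so index = 0, shift = 60, and only 4 bits remain in limb 0; nbBitsHigh = shift - (64-c) = 1 and shiftHigh = 4. As a standalone model:

// windowC5Chunk12 extracts the 5-bit digit of chunk 12 when c = 5:
// 4 low bits from the top of limb 0, 1 high bit from the bottom of limb 1.
func windowC5Chunk12(limbs [4]uint64) uint64 {
	lo := (limbs[0] >> 60) & 0x1f // mask = 0x1f << shift, then >> shift
	hi := limbs[1] & 0x1          // maskHigh = (1 << nbBitsHigh) - 1
	return lo + hi<<4             // shiftHigh = c - nbBitsHigh
}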
+	chSmallValues := make(chan int, nbTasks)
+
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int
+
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.FitsOnOneWord() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c-bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}
+
+			// for each chunk in the scalar, compute the current digit, and a possible carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]
+
+				// init with carry if any
+				digit := carry
+				carry = 0
+
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)
+
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}
+
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}
+
+				// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+				// 2^{c} from the current digit, making it negative.
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
+
+				var bits uint32
+				if digit >= 0 {
+					bits = uint32(digit) << 1
+				} else {
+					bits = (uint32(-digit-1) << 1) + 1
+				}
+				toReturn[int(chunk)*len(scalars)+i] = bits
+				// toReturn[i][s.index] |= (bits << s.shift)
+				// if s.multiWordSelect {
+				//	toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				// }
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
+}
+
+// partitionScalarsOld computes, for each scalar over c-bit wide windows, nbChunks digits
+// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
+// scalarsMont indicates whether the provided scalars are in montgomery form
+// returns smallValues, which represents the number of scalars which meet the following condition
+// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero)
+func partitionScalarsOld(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
 	toReturn := make([]fr.Element, len(scalars))
 
 	// number of c-bit radixes in a scalar
diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go
index 916d8beced..2f19119cc6 100644
--- a/ecc/bls24-317/multiexp_affine.go
+++ b/ecc/bls24-317/multiexp_affine.go
@@ -16,10 +16,6 @@
 
 package bls24317
 
-import (
-	"github.com/consensys/gnark-crypto/ecc/bls24-317/fr"
-)
-
 const MAX_BATCH_SIZE = 600
 
 type batchOp struct {
@@ -40,35 +36,18 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64,
 	chRes chan<- g1JacExtended,
 	c uint64,
 	points []G1Affine,
-	scalars []fr.Element) {
+	pscalars []uint32) {
 
-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
 	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
 
-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
-	}
-
 	batch := newBatchG1Affine(&buckets, points)
 	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
 	nbBatches := 0
-	for i := 0; i < len(scalars); i++ {
-		bits := (scalars[i][s.index] & s.mask) >> s.shift
-		if s.multiWordSelect {
-			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
-		}
+	for i := 0; i < len(pscalars); i++ {
+		bits := pscalars[i]
 
 		if bits == 0 {
 			continue
@@ -76,13 +55,13 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64,
 		op := batchOp{pointID: uint32(i) << 1}
 		// if msbWindow bit is set, we need to substract
-		if bits&msbWindow == 0 {
+		if bits&1 == 0 {
 			// add
-			op.bucketID = uint32(bits - 1)
+			op.bucketID = uint32((bits >> 1) - 1)
 			// buckets[bits-1].Add(&points[i], &buckets[bits-1])
 		} else {
 			// sub
-			op.bucketID = (uint32(bits & ^msbWindow))
+			op.bucketID = (uint32((bits >> 1)))
 			op.pointID += 1
 			// op.isNeg = true
 			// buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i])
@@ -283,35 +262,18 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64,
 	chRes chan<- g2JacExtended,
 	c uint64,
 	points []G2Affine,
-	scalars []fr.Element) {
+	pscalars []uint32) {
 
-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
 	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
 
-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
-	}
-
 	batch := newBatchG2Affine(&buckets, points)
 	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
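With the digits precomputed, the branch above (and its G2 twin just below) is the entire per-point decode: bit 0 of the packed digit selects add versus subtract, and the remaining bits index a bucket that only ever covers magnitudes 1..2^{c-1}, which is how signed digits halve the bucket count. A minimal sketch of that decode, outside the patch:

// decodeDigit mirrors the branch in processChunkG1BatchAffine above:
// even packed digits are additions into bucket d-1; odd ones are
// subtractions of magnitude bits>>1 + 1, stored in bucket bits>>1.
func decodeDigit(bits uint32) (bucketID uint32, isNeg bool) {
    if bits&1 == 0 {
        return (bits >> 1) - 1, false // digit d > 0 lives in bucket d-1
    }
    return bits >> 1, true // digit d < 0 was packed as ((-d-1)<<1)|1
}

The same flag is threaded through batchOp: pointID is i<<1, with the low bit set for a subtraction, so the batch adder can recover the sign without an extra field.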
nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue @@ -319,13 +281,13 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - op.bucketID = uint32(bits - 1) + op.bucketID = uint32((bits >> 1) - 1) // buckets[bits-1].Add(&points[i], &buckets[bits-1]) } else { // sub - op.bucketID = (uint32(bits & ^msbWindow)) + op.bucketID = (uint32((bits >> 1))) op.pointID += 1 // op.isNeg = true // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) diff --git a/ecc/bls24-317/multiexp_jacobian.go b/ecc/bls24-317/multiexp_jacobian.go index b928244181..a79434b537 100644 --- a/ecc/bls24-317/multiexp_jacobian.go +++ b/ecc/bls24-317/multiexp_jacobian.go @@ -16,54 +16,32 @@ package bls24317 -import ( - "github.com/consensys/gnark-crypto/ecc/bls24-317/fr" -) - func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } @@ -127,46 +105,28 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. 
- for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index 7e39930e23..feb0b0c87f 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) diff --git a/ecc/bn254/g1.go b/ecc/bn254/g1.go index 4056491a53..0844716e0f 100644 --- a/ecc/bn254/g1.go +++ b/ecc/bn254/g1.go @@ -887,7 +887,7 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bn254/g2.go b/ecc/bn254/g2.go index deeb006578..23203fd92c 100644 --- a/ecc/bn254/g2.go +++ b/ecc/bn254/g2.go @@ -919,7 +919,7 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index e519895eb5..19b6fd8496 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -106,120 +106,113 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. 
+ // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. - _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G1Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := 
processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG1BatchAffine[bucketG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - _innerMsmG1(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG1BatchAffine[bucketG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG1BatchAffine[bucketG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG1BatchAffine[bucketG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - _innerMsmG1(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG1BatchAffine[bucketG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG1BatchAffine[bucketG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG1BatchAffine[bucketG1AffineC20] processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG1BatchAffine[bucketG1AffineC21] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { +func _innerMsmG1(p *G1Jac, 
c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -238,10 +231,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) + n := len(points) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, scalars) + go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -249,12 +243,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, scalars) + go processChunk(0, chChunks[0], c, points, pscalars[:n]) } else { chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, chSplit, c, points[:split], scalars[:split]) - go processChunk(0, chSplit, c, points[split:], scalars[split:]) + split := n / 2 + go processChunk(0, chSplit, c, points[:split], pscalars[:split]) + go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -366,120 +360,113 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. + // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
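Note the shape change: partitionScalars now returns one flat []uint32 of length nbChunks*len(scalars), laid out chunk-major. That layout is what makes the plain slicing pscalars[j*n:(j+1)*n] in _innerMsmG1/_innerMsmG2 valid, and it hands each chunk goroutine a contiguous, cache-friendly block instead of strided limb reads. The indexing convention, spelled out as a sketch (inferred from the slicing; not a helper in the patch):

// digitAt returns the packed digit of point i in window j, given n
// points, under the chunk-major layout produced by partitionScalars.
func digitAt(pscalars []uint32, n, j, i int) uint32 {
    return pscalars[j*n+i]
}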
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G2Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG2BatchAffine[bucketG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - _innerMsmG2(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG2BatchAffine[bucketG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 11, points, scalars, splitFirstChunk, processChunk, 
processLastChunk) + _innerMsmG2(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG2BatchAffine[bucketG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG2BatchAffine[bucketG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - _innerMsmG2(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG2BatchAffine[bucketG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG2BatchAffine[bucketG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG2BatchAffine[bucketG2AffineC20] processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG2BatchAffine[bucketG2AffineC21] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -498,10 +485,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. 
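The first-chunk split below exists because SNARK witnesses are often dominated by 0 and 1 values: such scalars yield a non-zero digit only in window 0, so chunk 0 carries far more bucket work than the others. The decision is the 10% heuristic computed from partitionScalars' smallValues return, mirrored here as a sketch (the threshold is the code's, the helper name is not):

// shouldSplitFirstChunk reproduces the heuristic above: smallValues
// counts scalars whose only non-zero digits sit in the lowest window.
func shouldSplitFirstChunk(smallValues, nbScalars int) bool {
    return float64(smallValues)/float64(nbScalars) >= 0.1
}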
-	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars)
+	n := len(points)
+	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:])
 
 	for j := int(nbChunks - 2); j > 0; j-- {
-		go processChunk(uint64(j), chChunks[j], c, points, scalars)
+		go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n])
 	}
 
 	// the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1]
@@ -509,12 +497,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp
 	// in the ~same amount of time
 	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
 		if !splitFirstChunk {
-			go processChunk(0, chChunks[0], c, points, scalars)
+			go processChunk(0, chChunks[0], c, points, pscalars[:n])
 		} else {
 			chSplit := make(chan g2JacExtended, 2)
-			split := len(points) / 2
-			go processChunk(0, chSplit, c, points[:split], scalars[:split])
-			go processChunk(0, chSplit, c, points[split:], scalars[split:])
+			split := n / 2
+			go processChunk(0, chSplit, c, points[:split], pscalars[:split])
+			go processChunk(0, chSplit, c, points[split:], pscalars[split:n])
 			go func() {
 				s1 := <-chSplit
 				s2 := <-chSplit
@@ -565,7 +553,132 @@ type selector struct {
 // scalarsMont indicates wheter the provided scalars are in montgomery form
 // returns smallValues, which represent the number of scalars which meets the following condition
 // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero)
-func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) {
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
+
+	toReturn := make([]uint32, len(scalars)*int(nbChunks))
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	// msbWindow := uint64(1 << (c -1)) // msb of the c-bit window
+	max := int(1 << (c - 1))    // max value we want for our digits
+	cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words
+
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
+		}
+		selectors[chunk] = d
+	}
+
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more goroutines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split the first chunk
+	// processing in the msm in 2, to ensure all goroutines finish at ~the same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks goroutines
+	// if it does, though, this will deadlock.
+	chSmallValues := make(chan int, nbTasks)
+
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int
+
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.FitsOnOneWord() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c-bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}
+
+			// for each chunk in the scalar, compute the current digit, and an eventual carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]
+
+				// init with carry if any
+				digit := carry
+				carry = 0
+
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)
+
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}
+
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}
+
+				// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window
+				// and subtract 2^c from the current digit, making it negative.
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
+
+				var bits uint32
+				if digit >= 0 {
+					bits = uint32(digit) << 1
+				} else {
+					bits = (uint32(-digit-1) << 1) + 1
+				}
+				toReturn[int(chunk)*len(scalars)+i] = bits
+				// [s.index] |= (bits << s.shift)
+				// if s.multiWordSelect {
+				// 	toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				// }
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
+}
+
+// partitionScalarsOld computes, for each scalar over c-bit wide windows, nbChunks digits
+// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+// 2^c from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
+// scalarsMont indicates whether the provided scalars are in montgomery form
+// returns smallValues, which represents the number of scalars which meet the following condition
+// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero)
+func partitionScalarsOld(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
 	toReturn := make([]fr.Element, len(scalars))
 
 	// number of c-bit radixes in a scalar
diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go
index 8b10c9786f..0028bf14dd 100644
--- a/ecc/bn254/multiexp_affine.go
+++ b/ecc/bn254/multiexp_affine.go
@@ -16,10 +16,6 @@
 
 package bn254
 
-import (
-	"github.com/consensys/gnark-crypto/ecc/bn254/fr"
-)
-
 const MAX_BATCH_SIZE = 600
 
 type batchOp struct {
@@ -40,35 +36,18 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64,
 	chRes chan<- g1JacExtended,
 	c uint64,
 	points []G1Affine,
-	scalars []fr.Element) {
+	pscalars []uint32) {
 
-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
 	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
 
-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
-	}
-
 	batch := newBatchG1Affine(&buckets, points)
 	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
 	nbBatches := 0
-	for i := 0; i < len(scalars); i++ {
-		bits := (scalars[i][s.index] & s.mask) >> s.shift
-		if s.multiWordSelect {
-			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
-		}
+	for i := 0; i < len(pscalars); i++ {
+		bits := pscalars[i]
 
 		if bits == 0 {
 			continue
@@ -76,13 +55,13 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64,
 		op := batchOp{pointID: uint32(i) << 1}
 		// if msbWindow bit is set, we need to substract
-		if bits&msbWindow == 0 {
+		if bits&1 == 0 {
 			// add
-			op.bucketID = uint32(bits - 1)
+			op.bucketID = uint32((bits >> 1) - 1)
 			// buckets[bits-1].Add(&points[i], &buckets[bits-1])
 		} else {
 			// sub
-			op.bucketID = (uint32(bits & ^msbWindow))
+			op.bucketID = (uint32((bits >> 1)))
 			op.pointID += 1
 			// op.isNeg = true
 			// buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i])
@@ -283,35 +262,18 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64,
 	chRes chan<- g2JacExtended,
 	c uint64,
 	points []G2Affine,
-	scalars []fr.Element) {
+	pscalars []uint32) {
 
-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
 	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
 
-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
-	}
-
 	batch := newBatchG2Affine(&buckets, points)
 	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
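A note on the queue next to the batch: the batch-affine adder amortizes one field inversion across up to MAX_BATCH_SIZE bucket updates (compare the add cost of 5*batchSize M + 1I quoted for the new batch add). The internals of newBatchG1Affine/newBatchG2Affine are not in this hunk, so the following is an assumption: two updates to the same bucket presumably cannot share a batch, because every destination bucket is read before the single shared inversion, hence the overflow queue for conflicting ops. A hypothetical conflict test along those lines, reusing the patch's batchOp type:

// pendingSet is a hypothetical sketch, not the patch's API: it tracks
// buckets already claimed by the in-flight batch, so that a second
// update to the same bucket is deferred to the queue instead.
type pendingSet map[uint32]struct{}

func (p pendingSet) tryAdd(op batchOp) bool {
    if _, busy := p[op.bucketID]; busy {
        return false // caller appends op to the overflow queue
    }
    p[op.bucketID] = struct{}{}
    return true
}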
nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue @@ -319,13 +281,13 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - op.bucketID = uint32(bits - 1) + op.bucketID = uint32((bits >> 1) - 1) // buckets[bits-1].Add(&points[i], &buckets[bits-1]) } else { // sub - op.bucketID = (uint32(bits & ^msbWindow)) + op.bucketID = (uint32((bits >> 1))) op.pointID += 1 // op.isNeg = true // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) diff --git a/ecc/bn254/multiexp_jacobian.go b/ecc/bn254/multiexp_jacobian.go index 5434d0e1aa..2b0db816d2 100644 --- a/ecc/bn254/multiexp_jacobian.go +++ b/ecc/bn254/multiexp_jacobian.go @@ -16,54 +16,32 @@ package bn254 -import ( - "github.com/consensys/gnark-crypto/ecc/bn254/fr" -) - func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } @@ -127,46 +105,28 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. 
- for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index 7fbb203ce1..a9df7af3fa 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) diff --git a/ecc/bw6-633/g1.go b/ecc/bw6-633/g1.go index f70d2b30cc..860ce3e355 100644 --- a/ecc/bw6-633/g1.go +++ b/ecc/bw6-633/g1.go @@ -1019,7 +1019,7 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bw6-633/g2.go b/ecc/bw6-633/g2.go index f9284d2ec7..12579994ab 100644 --- a/ecc/bw6-633/g2.go +++ b/ecc/bw6-633/g2.go @@ -885,7 +885,7 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index ceb1ad7847..9c168de3c5 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -106,75 +106,68 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. 
+ // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. - _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G1Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac 
{ +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -193,10 +186,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) + n := len(points) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, scalars) + go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -204,12 +198,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, scalars) + go processChunk(0, chChunks[0], c, points, pscalars[:n]) } else { chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, chSplit, c, points[:split], scalars[:split]) - go processChunk(0, chSplit, c, points[split:], scalars[split:]) + split := n / 2 + go processChunk(0, chSplit, c, points[:split], pscalars[:split]) + go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -321,75 +315,68 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. + // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G2Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -408,10 +395,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. 
-	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars)
+	n := len(points)
+	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:])
 
 	for j := int(nbChunks - 2); j > 0; j-- {
-		go processChunk(uint64(j), chChunks[j], c, points, scalars)
+		go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n])
 	}
 
 	// the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1]
@@ -419,12 +407,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp
 	// in the ~same amount of time
 	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
 		if !splitFirstChunk {
-			go processChunk(0, chChunks[0], c, points, scalars)
+			go processChunk(0, chChunks[0], c, points, pscalars[:n])
 		} else {
 			chSplit := make(chan g2JacExtended, 2)
-			split := len(points) / 2
-			go processChunk(0, chSplit, c, points[:split], scalars[:split])
-			go processChunk(0, chSplit, c, points[split:], scalars[split:])
+			split := n / 2
+			go processChunk(0, chSplit, c, points[:split], pscalars[:split])
+			go processChunk(0, chSplit, c, points[split:], pscalars[split:n])
 			go func() {
 				s1 := <-chSplit
 				s2 := <-chSplit
@@ -475,7 +463,132 @@ type selector struct {
 // scalarsMont indicates wheter the provided scalars are in montgomery form
 // returns smallValues, which represent the number of scalars which meets the following condition
 // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero)
-func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) {
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
+
+	toReturn := make([]uint32, len(scalars)*int(nbChunks))
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	// msbWindow := uint64(1 << (c -1)) // msb of the c-bit window
+	max := int(1 << (c - 1))    // max value we want for our digits
+	cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words
+
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
+		}
+		selectors[chunk] = d
+	}
+
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more goroutines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split the first chunk
+	// processing in the msm in 2, to ensure all goroutines finish at ~the same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks goroutines
+	// if it does, though, this will deadlock.
+	chSmallValues := make(chan int, nbTasks)
+
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int
+
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.FitsOnOneWord() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c-bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}
+
+			// for each chunk in the scalar, compute the current digit, and an eventual carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]
+
+				// init with carry if any
+				digit := carry
+				carry = 0
+
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)
+
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}
+
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}
+
+				// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window
+				// and subtract 2^c from the current digit, making it negative.
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
+
+				var bits uint32
+				if digit >= 0 {
+					bits = uint32(digit) << 1
+				} else {
+					bits = (uint32(-digit-1) << 1) + 1
+				}
+				toReturn[int(chunk)*len(scalars)+i] = bits
+				// [s.index] |= (bits << s.shift)
+				// if s.multiWordSelect {
+				// 	toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				// }
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
+}
+
+// partitionScalarsOld computes, for each scalar over c-bit wide windows, nbChunks digits
+// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+// 2^c from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
+// scalarsMont indicates whether the provided scalars are in montgomery form
+// returns smallValues, which represents the number of scalars which meet the following condition
+// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero)
+func partitionScalarsOld(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
 	toReturn := make([]fr.Element, len(scalars))
 
 	// number of c-bit radixes in a scalar
diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go
index 221079f874..1cc44fcaa6 100644
--- a/ecc/bw6-633/multiexp_affine.go
+++ b/ecc/bw6-633/multiexp_affine.go
@@ -16,10 +16,6 @@
 
 package bw6633
 
-import (
-	"github.com/consensys/gnark-crypto/ecc/bw6-633/fr"
-)
-
 const MAX_BATCH_SIZE = 600
 
 type batchOp struct {
@@ -40,35 +36,18 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64,
 	chRes chan<- g1JacExtended,
 	c uint64,
 	points []G1Affine,
-	scalars []fr.Element) {
+	pscalars []uint32) {
 
-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
 	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
 
-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
-	}
-
 	batch := newBatchG1Affine(&buckets, points)
 	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
 	nbBatches := 0
-	for i := 0; i < len(scalars); i++ {
-		bits := (scalars[i][s.index] & s.mask) >> s.shift
-		if s.multiWordSelect {
-			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
-		}
+	for i := 0; i < len(pscalars); i++ {
+		bits := pscalars[i]
 
 		if bits == 0 {
 			continue
@@ -76,13 +55,13 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64,
 		op := batchOp{pointID: uint32(i) << 1}
 		// if msbWindow bit is set, we need to substract
-		if bits&msbWindow == 0 {
+		if bits&1 == 0 {
 			// add
-			op.bucketID = uint32(bits - 1)
+			op.bucketID = uint32((bits >> 1) - 1)
 			// buckets[bits-1].Add(&points[i], &buckets[bits-1])
 		} else {
 			// sub
-			op.bucketID = (uint32(bits & ^msbWindow))
+			op.bucketID = (uint32((bits >> 1)))
 			op.pointID += 1
 			// op.isNeg = true
 			// buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i])
@@ -261,35 +240,18 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64,
 	chRes chan<- g2JacExtended,
 	c uint64,
 	points []G2Affine,
-	scalars []fr.Element) {
+	pscalars []uint32) {
 
-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
 	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
 
-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
-	}
-
 	batch := newBatchG2Affine(&buckets, points)
 	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
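The batchOp constructed in the hunk above packs two flags into its IDs: the low bit of the partitioned scalar selects add vs. subtract, and the low bit of pointID records the subtraction for the batch processor. A minimal decode sketch (decodeOp is a hypothetical helper, not part of this patch):

    // decodeOp mirrors the op construction above: an even digit encoding d<<1
    // targets bucket d-1 as an addition; an odd encoding (m<<1)+1 targets
    // bucket m with the point negated, signalled by the low bit of pointID.
    func decodeOp(bits uint32, i int) batchOp {
    	op := batchOp{pointID: uint32(i) << 1}
    	if bits&1 == 0 {
    		op.bucketID = (bits >> 1) - 1 // add points[i]
    	} else {
    		op.bucketID = bits >> 1 // subtract points[i]
    		op.pointID |= 1         // same effect as op.pointID += 1, since the low bit was 0
    	}
    	return op
    }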
nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue @@ -297,13 +259,13 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - op.bucketID = uint32(bits - 1) + op.bucketID = uint32((bits >> 1) - 1) // buckets[bits-1].Add(&points[i], &buckets[bits-1]) } else { // sub - op.bucketID = (uint32(bits & ^msbWindow)) + op.bucketID = (uint32((bits >> 1))) op.pointID += 1 // op.isNeg = true // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) diff --git a/ecc/bw6-633/multiexp_jacobian.go b/ecc/bw6-633/multiexp_jacobian.go index 8ed7343862..996352830f 100644 --- a/ecc/bw6-633/multiexp_jacobian.go +++ b/ecc/bw6-633/multiexp_jacobian.go @@ -16,54 +16,32 @@ package bw6633 -import ( - "github.com/consensys/gnark-crypto/ecc/bw6-633/fr" -) - func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } @@ -101,46 +79,28 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. 
- for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 0a8268cd6a..5fd9a64f32 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) diff --git a/ecc/bw6-756/g1.go b/ecc/bw6-756/g1.go index 038e4f1b42..d53b7f5f82 100644 --- a/ecc/bw6-756/g1.go +++ b/ecc/bw6-756/g1.go @@ -1019,7 +1019,7 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bw6-756/g2.go b/ecc/bw6-756/g2.go index cb9fadd15d..049841f4f7 100644 --- a/ecc/bw6-756/g2.go +++ b/ecc/bw6-756/g2.go @@ -879,7 +879,7 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index ee5ff35a9a..04a630a92a 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -106,76 +106,69 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. 
+ // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. - _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G1Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- 
g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -194,10 +187,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) + n := len(points) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, scalars) + go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -205,12 +199,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, scalars) + go processChunk(0, chChunks[0], c, points, pscalars[:n]) } else { chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, chSplit, c, points[:split], scalars[:split]) - go processChunk(0, chSplit, c, points[split:], scalars[split:]) + split := n / 2 + go processChunk(0, chSplit, c, points[:split], pscalars[:split]) + go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -322,76 +316,69 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. + // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
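The partitioned scalars are now laid out chunk-major: partitionScalars writes the digit of point i for chunk j at pscalars[j*n+i], with n = len(points), which is why each chunk worker above receives the contiguous slice pscalars[j*n:(j+1)*n]. A minimal sketch of the indexing, with digitOf as a hypothetical helper:

    // digitOf returns the signed-digit encoding for point i in chunk j.
    // Equivalent to pscalars[j*n : (j+1)*n][i], the slice handed to chunk j.
    func digitOf(pscalars []uint32, n, j, i int) uint32 {
    	return pscalars[j*n+i]
    }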
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G2Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -410,10 +397,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. 
-	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars)
+	n := len(points)
+	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:])
 
 	for j := int(nbChunks - 2); j > 0; j-- {
-		go processChunk(uint64(j), chChunks[j], c, points, scalars)
+		go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n])
 	}
 
 	// the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1]
@@ -421,12 +409,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp
 	// in the ~same amount of time
 	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
 		if !splitFirstChunk {
-			go processChunk(0, chChunks[0], c, points, scalars)
+			go processChunk(0, chChunks[0], c, points, pscalars[:n])
 		} else {
 			chSplit := make(chan g2JacExtended, 2)
-			split := len(points) / 2
-			go processChunk(0, chSplit, c, points[:split], scalars[:split])
-			go processChunk(0, chSplit, c, points[split:], scalars[split:])
+			split := n / 2
+			go processChunk(0, chSplit, c, points[:split], pscalars[:split])
+			go processChunk(0, chSplit, c, points[split:], pscalars[split:n])
 			go func() {
 				s1 := <-chSplit
 				s2 := <-chSplit
@@ -477,7 +465,132 @@ type selector struct {
 // scalarsMont indicates wheter the provided scalars are in montgomery form
 // returns smallValues, which represent the number of scalars which meets the following condition
 // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero)
-func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) {
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
+
+	toReturn := make([]uint32, len(scalars)*int(nbChunks))
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	// msbWindow := uint64(1 << (c -1)) // msb of the c-bit window
+	max := int(1 << (c - 1))    // max value we want for our digits
+	cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words
+
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
+		}
+		selectors[chunk] = d
+	}
+
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split the first chunk
+	// processing in the msm in 2, to ensure all go routines finish at ~the same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it does, though, this will deadlock.
+	chSmallValues := make(chan int, nbTasks)
+
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int
+
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.FitsOnOneWord() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c-bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}
+
+			// for each chunk in the scalar, compute the current digit, and an eventual carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]
+
+				// init with carry if any
+				digit := carry
+				carry = 0
+
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)
+
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}
+
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}
+
+				// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+				// 2^{c} from the current digit, making it negative.
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
+
+				var bits uint32
+				if digit >= 0 {
+					bits = uint32(digit) << 1
+				} else {
+					bits = (uint32(-digit-1) << 1) + 1
+				}
+				toReturn[int(chunk)*len(scalars)+i] = bits
+				// [s.index] |= (bits << s.shift)
+				// if s.multiWordSelect {
+				// 	toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				// }
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
+}
+
+// partitionScalarsOld computes, for each scalar, over c-bit wide windows, nbChunks digits
+// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
+// scalarsMont indicates whether the provided scalars are in montgomery form
+// returns smallValues, which represents the number of scalars which meet the following condition
+// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero)
+func partitionScalarsOld(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
 	toReturn := make([]fr.Element, len(scalars))
 
 	// number of c-bit radixes in a scalar
diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go
index 537221cb69..2a439173b9 100644
--- a/ecc/bw6-756/multiexp_affine.go
+++ b/ecc/bw6-756/multiexp_affine.go
@@ -16,10 +16,6 @@
 
 package bw6756
 
-import (
-	"github.com/consensys/gnark-crypto/ecc/bw6-756/fr"
-)
-
 const MAX_BATCH_SIZE = 600
 
 type batchOp struct {
@@ -40,35 +36,18 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64,
 	chRes chan<- g1JacExtended,
 	c uint64,
 	points []G1Affine,
-	scalars []fr.Element) {
+	pscalars []uint32) {
 
-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
 	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
 
-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
-	}
-
 	batch := newBatchG1Affine(&buckets, points)
 	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
 	nbBatches := 0
-	for i := 0; i < len(scalars); i++ {
-		bits := (scalars[i][s.index] & s.mask) >> s.shift
-		if s.multiWordSelect {
-			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
-		}
+	for i := 0; i < len(pscalars); i++ {
+		bits := pscalars[i]
 
 		if bits == 0 {
 			continue
@@ -76,13 +55,13 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64,
 		op := batchOp{pointID: uint32(i) << 1}
 		// if msbWindow bit is set, we need to substract
-		if bits&msbWindow == 0 {
+		if bits&1 == 0 {
 			// add
-			op.bucketID = uint32(bits - 1)
+			op.bucketID = uint32((bits >> 1) - 1)
 			// buckets[bits-1].Add(&points[i], &buckets[bits-1])
 		} else {
 			// sub
-			op.bucketID = (uint32(bits & ^msbWindow))
+			op.bucketID = (uint32((bits >> 1)))
 			op.pointID += 1
 			// op.isNeg = true
 			// buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i])
@@ -261,35 +240,18 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64,
 	chRes chan<- g2JacExtended,
 	c uint64,
 	points []G2Affine,
-	scalars []fr.Element) {
+	pscalars []uint32) {
 
-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
 	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
 
-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
-	}
-
 	batch := newBatchG2Affine(&buckets, points)
 	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
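Aside on the queue allocated above: one batch of affine additions shares a single batch inversion, so in the batch-affine scheme this patch ports, a batch cannot safely contain two additions into the same bucket. A heavily simplified sketch of that scheduling idea, assuming hypothetical busy/addToBatch internals that these hunks elide:

    // scheduleOp appends op to the current batch unless its bucket was already
    // touched in this batch, in which case the op is parked in the queue and
    // replayed after the batch's shared inversion has been executed.
    func scheduleOp(busy map[uint32]bool, queue []batchOp, op batchOp, addToBatch func(batchOp)) []batchOp {
    	if busy[op.bucketID] {
    		return append(queue, op) // conflict: defer to a later batch
    	}
    	busy[op.bucketID] = true
    	addToBatch(op)
    	return queue
    }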
nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue @@ -297,13 +259,13 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - op.bucketID = uint32(bits - 1) + op.bucketID = uint32((bits >> 1) - 1) // buckets[bits-1].Add(&points[i], &buckets[bits-1]) } else { // sub - op.bucketID = (uint32(bits & ^msbWindow)) + op.bucketID = (uint32((bits >> 1))) op.pointID += 1 // op.isNeg = true // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) diff --git a/ecc/bw6-756/multiexp_jacobian.go b/ecc/bw6-756/multiexp_jacobian.go index 984264456b..1f7ec4b3f8 100644 --- a/ecc/bw6-756/multiexp_jacobian.go +++ b/ecc/bw6-756/multiexp_jacobian.go @@ -16,54 +16,32 @@ package bw6756 -import ( - "github.com/consensys/gnark-crypto/ecc/bw6-756/fr" -) - func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } @@ -101,46 +79,28 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. 
- for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index 0d0384701c..c8ddd47a9f 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) diff --git a/ecc/bw6-761/g1.go b/ecc/bw6-761/g1.go index 765d29433b..8694980eda 100644 --- a/ecc/bw6-761/g1.go +++ b/ecc/bw6-761/g1.go @@ -1030,7 +1030,7 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bw6-761/g2.go b/ecc/bw6-761/g2.go index fdb98731d4..3198411f9e 100644 --- a/ecc/bw6-761/g2.go +++ b/ecc/bw6-761/g2.go @@ -893,7 +893,7 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index d9da35c23d..b0fb81e0e6 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -106,76 +106,69 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. 
+ // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. - _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G1Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- 
g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -194,10 +187,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) + n := len(points) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, scalars) + go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -205,12 +199,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, scalars) + go processChunk(0, chChunks[0], c, points, pscalars[:n]) } else { chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, chSplit, c, points[:split], scalars[:split]) - go processChunk(0, chSplit, c, points[split:], scalars[split:]) + split := n / 2 + go processChunk(0, chSplit, c, points[:split], pscalars[:split]) + go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -322,76 +316,69 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. + // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G2Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -410,10 +397,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. 
-	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars)
+	n := len(points)
+	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:])
 
 	for j := int(nbChunks - 2); j > 0; j-- {
-		go processChunk(uint64(j), chChunks[j], c, points, scalars)
+		go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n])
 	}
 
 	// the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1]
@@ -421,12 +409,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp
 	// in the ~same amount of time
 	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
 		if !splitFirstChunk {
-			go processChunk(0, chChunks[0], c, points, scalars)
+			go processChunk(0, chChunks[0], c, points, pscalars[:n])
 		} else {
 			chSplit := make(chan g2JacExtended, 2)
-			split := len(points) / 2
-			go processChunk(0, chSplit, c, points[:split], scalars[:split])
-			go processChunk(0, chSplit, c, points[split:], scalars[split:])
+			split := n / 2
+			go processChunk(0, chSplit, c, points[:split], pscalars[:split])
+			go processChunk(0, chSplit, c, points[split:], pscalars[split:n])
 			go func() {
 				s1 := <-chSplit
 				s2 := <-chSplit
@@ -477,7 +465,132 @@ type selector struct {
 // scalarsMont indicates wheter the provided scalars are in montgomery form
 // returns smallValues, which represent the number of scalars which meets the following condition
 // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero)
-func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) {
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
+
+	toReturn := make([]uint32, len(scalars)*int(nbChunks))
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	// msbWindow := uint64(1 << (c -1)) // msb of the c-bit window
+	max := int(1 << (c - 1))    // max value we want for our digits
+	cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words
+
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
+		}
+		selectors[chunk] = d
+	}
+
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split the first chunk
+	// processing in the msm in 2, to ensure all go routines finish at ~the same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it does, though, this will deadlock.
+	chSmallValues := make(chan int, nbTasks)
+
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int
+
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.FitsOnOneWord() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c-bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}
+
+			// for each chunk in the scalar, compute the current digit, and an eventual carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]
+
+				// init with carry if any
+				digit := carry
+				carry = 0
+
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)
+
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}
+
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}
+
+				// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+				// 2^{c} from the current digit, making it negative.
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
+
+				var bits uint32
+				if digit >= 0 {
+					bits = uint32(digit) << 1
+				} else {
+					bits = (uint32(-digit-1) << 1) + 1
+				}
+				toReturn[int(chunk)*len(scalars)+i] = bits
+				// [s.index] |= (bits << s.shift)
+				// if s.multiWordSelect {
+				// 	toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				// }
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
+}
+
+// partitionScalarsOld computes, for each scalar, over c-bit wide windows, nbChunks digits
+// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
+// scalarsMont indicates whether the provided scalars are in montgomery form
+// returns smallValues, which represents the number of scalars which meet the following condition
+// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero)
+func partitionScalarsOld(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
 	toReturn := make([]fr.Element, len(scalars))
 
 	// number of c-bit radixes in a scalar
diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go
index f012110b80..6fec5d8f63 100644
--- a/ecc/bw6-761/multiexp_affine.go
+++ b/ecc/bw6-761/multiexp_affine.go
@@ -16,10 +16,6 @@
 
 package bw6761
 
-import (
-	"github.com/consensys/gnark-crypto/ecc/bw6-761/fr"
-)
-
 const MAX_BATCH_SIZE = 600
 
 type batchOp struct {
@@ -40,35 +36,18 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64,
 	chRes chan<- g1JacExtended,
 	c uint64,
 	points []G1Affine,
-	scalars []fr.Element) {
+	pscalars []uint32) {
 
-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
 	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
 
-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
-	}
-
 	batch := newBatchG1Affine(&buckets, points)
 	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
 	nbBatches := 0
-	for i := 0; i < len(scalars); i++ {
-		bits := (scalars[i][s.index] & s.mask) >> s.shift
-		if s.multiWordSelect {
-			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
-		}
+	for i := 0; i < len(pscalars); i++ {
+		bits := pscalars[i]
 
 		if bits == 0 {
 			continue
@@ -76,13 +55,13 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64,
 		op := batchOp{pointID: uint32(i) << 1}
 		// if msbWindow bit is set, we need to substract
-		if bits&msbWindow == 0 {
+		if bits&1 == 0 {
 			// add
-			op.bucketID = uint32(bits - 1)
+			op.bucketID = uint32((bits >> 1) - 1)
 			// buckets[bits-1].Add(&points[i], &buckets[bits-1])
 		} else {
 			// sub
-			op.bucketID = (uint32(bits & ^msbWindow))
+			op.bucketID = (uint32((bits >> 1)))
 			op.pointID += 1
 			// op.isNeg = true
 			// buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i])
@@ -261,35 +240,18 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64,
 	chRes chan<- g2JacExtended,
 	c uint64,
 	points []G2Affine,
-	scalars []fr.Element) {
+	pscalars []uint32) {
 
-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
 	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
 
-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
-	}
-
 	batch := newBatchG2Affine(&buckets, points)
 	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue @@ -297,13 +259,13 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - op.bucketID = uint32(bits - 1) + op.bucketID = uint32((bits >> 1) - 1) // buckets[bits-1].Add(&points[i], &buckets[bits-1]) } else { // sub - op.bucketID = (uint32(bits & ^msbWindow)) + op.bucketID = (uint32((bits >> 1))) op.pointID += 1 // op.isNeg = true // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) diff --git a/ecc/bw6-761/multiexp_jacobian.go b/ecc/bw6-761/multiexp_jacobian.go index 6e2acf4b41..48249ca28f 100644 --- a/ecc/bw6-761/multiexp_jacobian.go +++ b/ecc/bw6-761/multiexp_jacobian.go @@ -16,54 +16,32 @@ package bw6761 -import ( - "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" -) - func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } @@ -101,46 +79,28 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. 
- for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index fa82870fa4..cbcc319e1a 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index 6b5b935d81..bd4a489361 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -40,7 +40,136 @@ type selector struct { // scalarsMont indicates wheter the provided scalars are in montgomery form // returns smallValues, which represent the number of scalars which meets the following condition // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) { + // number of c-bit radixes in a scalar + nbChunks := fr.Limbs * 64 / c + if (fr.Limbs * 64)%c != 0 { + nbChunks++ + } + + toReturn := make([]uint32, len(scalars)*int(nbChunks)) + + mask := uint64((1 << c) - 1) // low c bits are 1 + // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window + max := int(1 << (c -1)) // max value we want for our digits + cDivides64 := (64 %c ) == 0 // if c doesn't divide 64, we may need to select over multiple words + + + // compute offset and word selector / shift to select the right bits of our windows + selectors := make([]selector, nbChunks) + for chunk:=uint64(0); chunk < nbChunks; chunk++ { + jc := uint64(chunk * c) + d := selector{} + d.index = jc / 64 + d.shift = jc - (d.index * 64) + d.mask = mask << d.shift + d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs - 1 ) + if d.multiWordSelect { + nbBitsHigh := d.shift - uint64(64-c) + d.maskHigh = (1 << nbBitsHigh) - 1 + d.shiftHigh = (c - nbBitsHigh) + } + selectors[chunk] = d + } + + // for each chunk, we could track the number of non-zeros points we will need to process + // this way, if a chunk has 
more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split the first chunk
+	// processing in the msm in 2, to ensure all go routines finish at ~same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it does, though, this will deadlock.
+	chSmallValues := make(chan int, nbTasks)
+
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int
+
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.FitsOnOneWord() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c-bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}
+
+			// for each chunk in the scalar, compute the current digit, and an eventual carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]
+
+				// init with carry if any
+				digit := carry
+				carry = 0
+
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)
+
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}
+
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}
+
+				// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window
+				// and subtract 2^c from the current digit, making it negative.
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
+
+				var bits uint32
+				if digit >= 0 {
+					bits = uint32(digit) << 1
+				} else {
+					bits = (uint32(-digit-1) << 1) + 1
+				}
+				toReturn[int(chunk)*len(scalars)+i] = bits
+				// [s.index] |= (bits << s.shift)
+				// if s.multiWordSelect {
+				//	toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				// }
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
+}
+
+// partitionScalarsOld computes, for each scalar over c-bit wide windows, nbChunks digits
+// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and subtract
+// 2^c from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
+// scalarsMont indicates whether the provided scalars are in Montgomery form
+// returns smallValues, which represents the number of scalars that meet the following condition
+// 0 < scalar < 2^c (in other words, scalars where only the c least significant bits are non-zero)
+func partitionScalarsOld(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
 	toReturn := make([]fr.Element, len(scalars))
 
 	// number of c-bit radixes in a scalar
@@ -160,9 +289,6 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks
 	return toReturn, smallValues
 }
 
-
-
-
 {{define "multiexp" }}
@@ -249,55 +375,49 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem
 		return C
 	}
 
-	var C uint64
-	nbSplits := 1
-	nbChunks := 0
-	for nbChunks < config.NbTasks {
-		C = bestC(nbPoints)
-		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
-		if (fr.Limbs * 64) % C != 0 {
-			nbChunks ++
-		}
-		nbChunks *= nbSplits
-		if nbChunks < config.NbTasks {
-			nbSplits <<= 1
-			nbPoints >>= 1
-		}
+	// TODO @gbotrel restore split by calling outerMsm BEFORE partitioning scalars.
+	// nbSplits := 1
+	C := bestC(nbPoints)
+	nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+	if (fr.Limbs * 64) % C != 0 {
+		nbChunks ++
 	}
 
 	// partition the scalars
 	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
 	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
-	var smallValues int
-	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+	// var smallValues int
+	pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
 
 	// if we have more than 10% of small values, we split the processing of the first chunk in 2
 	// we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time
 	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
-
+	innerMsm{{ $.UPointName }}(p, int(C), points, pscalars, splitFirstChunk)
 	// we have nbSplits intermediate results that we must sum together.
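To see the borrow in the digit decomposition at work, take c = 4, so max = 2^{c-1} = 8: a window value of 13 becomes the digit 13 - 16 = -3 with a carry of 1 into the next window. A self-contained sketch of the recoding on a plain uint64 (toy code; partitionScalars itself works limb by limb on fr.Element):

// recodeToy splits a small scalar into signed c-bit digits, mirroring the
// borrow/carry logic of partitionScalars.
func recodeToy(scalar uint64, c uint) []int {
	max := 1 << (c - 1)
	mask := uint64(1)<<c - 1
	var digits []int
	carry := 0
	for scalar != 0 || carry != 0 {
		digit := int(scalar&mask) + carry
		scalar >>= c
		carry = 0
		if digit >= max {
			// borrow 2^c from the next window, making this digit negative
			digit -= 1 << c
			carry = 1
		}
		digits = append(digits, digit)
	}
	return digits
}

Reconstructing checks the invariant: for scalar 13 and c = 4, the digits are [-3, 1] and -3 + 1*16 = 13.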
- _p := make([]{{ $.TJacobian }}, nbSplits - 1) - chDone := make(chan int, nbSplits - 1) - for i:=0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsm{{ $.UPointName }}(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } + - innerMsm{{ $.UPointName }}(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - for i:=0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]{{ $.TJacobian }}, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsm{{ $.UPointName }}(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsm{{ $.UPointName }}(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffine }}, scalars []fr.Element, splitFirstChunk bool) { +func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffine }}, pscalars []uint32, splitFirstChunk bool) { {{- /* TODO @gbotrel need to deal with cases where lastC == 1 ; having a whole chunk with 1-bit window makes no sense */}} {{- /* also need to determine until which window size the ext-jacobian version is worth it. */}} switch c { @@ -310,14 +430,14 @@ func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffi processChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{$c}}] {{- end}} {{- if eq $c $lc}} - _innerMsm{{ $.UPointName }}(p, {{$c}}, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsm{{ $.UPointName }}(p, {{$c}}, points, pscalars, splitFirstChunk, processChunk, processChunk) {{- else}} {{- if le $lc 9}} processLastChunk := processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{lastC $c}}] {{- else}} processLastChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{lastC $c}}] {{- end}} - _innerMsm{{ $.UPointName }}(p, {{$c}}, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsm{{ $.UPointName }}(p, {{$c}}, points, pscalars, splitFirstChunk, processChunk, processLastChunk) {{- end}} {{- end}} default: @@ -325,8 +445,8 @@ func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffi } } -func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, scalars []fr.Element)) *{{ $.TJacobian }} { +func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, pscalars []uint32)) *{{ $.TJacobian }} { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -345,10 +465,11 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T } // 
the last chunk may be processed with a different method than the rest, as it could be smaller. - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) + n := len(points) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j >0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, scalars) + go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -356,12 +477,12 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0,chChunks[0], c, points, scalars) + go processChunk(0,chChunks[0], c, points, pscalars[:n]) } else { chSplit := make(chan {{ $.TJacobianExtended }}, 2) - split := len(points) / 2 - go processChunk(0,chSplit, c, points[:split], scalars[:split]) - go processChunk(0,chSplit, c, points[split:], scalars[split:]) + split := n / 2 + go processChunk(0,chSplit, c, points[:split], pscalars[:split]) + go processChunk(0,chSplit, c, points[split:], pscalars[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index 0c0fba41e5..518c7e3406 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -6,9 +6,6 @@ {{ $G2TJacobian := print (toUpper .G2.PointName) "Jac" }} {{ $G2TJacobianExtended := print (toLower .G2.PointName) "JacExtended" }} -import ( - "github.com/consensys/gnark-crypto/ecc/{{.Name}}/fr" -) const MAX_BATCH_SIZE = 600 @@ -40,35 +37,18 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64 chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, - scalars []fr.Element) { + pscalars []uint32) { - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64 %c)!=0 && s.shift > (64-c) && s.index < (fr.Limbs - 1 ) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - batch := newBatch{{ $.TAffine }}(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. 
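Note the layout this dispatch relies on: partitionScalars writes the digit of scalar i for chunk k at index k*len(scalars)+i, so each chunk goroutine receives a contiguous, independent window of the digits slice. In isolation (illustrative helper name):

// digitsForChunk returns the n digits consumed by one chunk worker, assuming
// the chunk-major layout digits[chunk*n+i] produced by partitionScalars.
func digitsForChunk(digits []uint32, chunk, n int) []uint32 {
	return digits[chunk*n : (chunk+1)*n]
}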
nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue @@ -76,13 +56,13 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64 op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - op.bucketID = uint32(bits - 1) + op.bucketID = uint32((bits>>1) - 1) // buckets[bits-1].Add(&points[i], &buckets[bits-1]) } else { // sub - op.bucketID = (uint32(bits & ^msbWindow)) + op.bucketID = (uint32((bits>>1))) op.pointID += 1 // op.isNeg = true // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) diff --git a/internal/generator/ecc/template/multiexp_jacobian.go.tmpl b/internal/generator/ecc/template/multiexp_jacobian.go.tmpl index d4e00fa442..8fb94f9f5b 100644 --- a/internal/generator/ecc/template/multiexp_jacobian.go.tmpl +++ b/internal/generator/ecc/template/multiexp_jacobian.go.tmpl @@ -6,9 +6,6 @@ {{ $G2TJacobian := print (toUpper .G2.PointName) "Jac" }} {{ $G2TJacobianExtended := print (toLower .G2.PointName) "JacExtended" }} -import ( - "github.com/consensys/gnark-crypto/ecc/{{.Name}}/fr" -) {{ template "multiexp" dict "PointName" .G1.PointName "UPointName" (toUpper .G1.PointName) "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange "LastCRange" .G1.LastCRange}} @@ -22,48 +19,30 @@ func processChunk{{ $.UPointName }}Jacobian[B ib{{ $.TJacobianExtended }}](chunk chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, - scalars []fr.Element) { + pscalars []uint32) { - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c -1)) var buckets B for i := 0 ; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64 %c)!=0 && s.shift > (64-c) && s.index < (fr.Limbs - 1 ) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - // for each scalars, get the digit corresponding to the chunk we're processing. 
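The split between the two chunk processors is a cost trade-off: with few buckets (small c) each bucket is hit many times and the extended-Jacobian addMixed path is fine, while for large c the batch-affine path amortizes a single field inversion over a whole batch of affine additions. A rough, illustrative cost model (the constants are ballpark figures for mixed Jacobian versus chord affine additions, not measurements from this patch):

// affineWinsToy compares ~11 field multiplications per mixed Jacobian addition
// against ~5 multiplications per batched affine addition plus an amortized
// share of one inversion (invCostInMuls multiplications per batch).
func affineWinsToy(batchSize, invCostInMuls int) bool {
	jacobian := 11 * batchSize
	affine := 5*batchSize + invCostInMuls
	return affine < jacobian
}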
- for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits & msbWindow == 0 { + if bits & 1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits>>1)].subMixed(&points[i]) } } diff --git a/internal/generator/ecc/template/point.go.tmpl b/internal/generator/ecc/template/point.go.tmpl index bbc5ec8980..9fc9cc1651 100644 --- a/internal/generator/ecc/template/point.go.tmpl +++ b/internal/generator/ecc/template/point.go.tmpl @@ -1480,7 +1480,7 @@ func BatchScalarMultiplication{{ toUpper .PointName }}(base *{{ $TAffine }}, sca baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index ef51368fbc..5fa8d37944 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -91,7 +91,7 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsm{{ toUpper $.PointName }}(&r16, 16, samplePoints[:], scalars16, true) + innerMsm{{ toUpper $.PointName }}(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) From c8613e89b60910631bb906406594751e099eae3a Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 8 Nov 2022 16:59:58 -0600 Subject: [PATCH 07/43] feat: gymnastic to ensure buckets are on the stack -- compiler hints --- ecc/bls12-377/multiexp.go | 100 ++-- ecc/bls12-377/multiexp_affine.go | 456 ++++++++---------- ecc/bls12-377/multiexp_jacobian.go | 28 +- ecc/bls12-377/multiexp_test.go | 4 +- ecc/bls12-378/multiexp.go | 100 ++-- ecc/bls12-378/multiexp_affine.go | 456 ++++++++---------- ecc/bls12-378/multiexp_jacobian.go | 28 +- ecc/bls12-378/multiexp_test.go | 4 +- ecc/bls12-381/multiexp.go | 100 ++-- ecc/bls12-381/multiexp_affine.go | 456 ++++++++---------- ecc/bls12-381/multiexp_jacobian.go | 28 +- ecc/bls12-381/multiexp_test.go | 4 +- ecc/bls24-315/multiexp.go | 100 ++-- ecc/bls24-315/multiexp_affine.go | 456 ++++++++---------- ecc/bls24-315/multiexp_jacobian.go | 28 +- ecc/bls24-315/multiexp_test.go | 4 +- ecc/bls24-317/multiexp.go | 100 ++-- ecc/bls24-317/multiexp_affine.go | 456 ++++++++---------- ecc/bls24-317/multiexp_jacobian.go | 28 +- ecc/bls24-317/multiexp_test.go | 4 +- ecc/bn254/multiexp.go | 100 ++-- ecc/bn254/multiexp_affine.go | 456 ++++++++---------- ecc/bn254/multiexp_jacobian.go | 28 +- ecc/bn254/multiexp_test.go | 4 +- ecc/bw6-633/multiexp.go | 56 +-- ecc/bw6-633/multiexp_affine.go | 456 ++++++++---------- ecc/bw6-633/multiexp_jacobian.go | 28 +- ecc/bw6-633/multiexp_test.go | 4 +- ecc/bw6-756/multiexp.go | 56 +-- ecc/bw6-756/multiexp_affine.go | 456 ++++++++---------- 
ecc/bw6-756/multiexp_jacobian.go | 28 +- ecc/bw6-756/multiexp_test.go | 4 +- ecc/bw6-761/multiexp.go | 56 +-- ecc/bw6-761/multiexp_affine.go | 456 ++++++++---------- ecc/bw6-761/multiexp_jacobian.go | 28 +- ecc/bw6-761/multiexp_test.go | 4 +- .../generator/ecc/template/multiexp.go.tmpl | 24 +- .../ecc/template/multiexp_affine.go.tmpl | 241 +++++---- .../ecc/template/multiexp_jacobian.go.tmpl | 14 +- .../ecc/template/tests/multiexp.go.tmpl | 2 +- 40 files changed, 2532 insertions(+), 2909 deletions(-) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index 4fad52e512..89db336b35 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -118,12 +118,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG1(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. // _p := make([]G1Jac, nbSplits - 1) @@ -146,73 +146,73 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 9, points, 
pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG1BatchAffine[bucketG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - _innerMsmG1(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG1BatchAffine[bucketG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG1BatchAffine[bucketG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG1BatchAffine[bucketG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - _innerMsmG1(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG1BatchAffine[bucketG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG1BatchAffine[bucketG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG1BatchAffine[bucketG1AffineC20] processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG1BatchAffine[bucketG1AffineC21] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -232,10 +232,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars 
[]uint32, split // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -243,12 +243,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -372,12 +372,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG2(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
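When splitFirstChunk is set, chunk 0 is processed by two goroutines over half of the points each, and a small fan-in sums the partial results so the downstream reduction still sees one value per chunk. Stripped of the surrounding code, the pattern is (a sketch written as if inside the package, since g1JacExtended and its add method are unexported):

// mergeSplit forwards the sum of the two half-chunk results to the channel the
// reduction reads, making the split invisible to the rest of the pipeline.
func mergeSplit(chSplit chan g1JacExtended, chChunk chan<- g1JacExtended) {
	s1 := <-chSplit
	s2 := <-chSplit
	close(chSplit)
	s1.add(&s2)
	chChunk <- s1
}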
// _p := make([]G2Jac, nbSplits - 1) @@ -400,73 +400,73 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG2BatchAffine[bucketG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - _innerMsmG2(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG2BatchAffine[bucketG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG2BatchAffine[bucketG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG2BatchAffine[bucketG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - _innerMsmG2(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG2BatchAffine[bucketG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 14, points, 
digits, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG2BatchAffine[bucketG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG2BatchAffine[bucketG2AffineC20] processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG2BatchAffine[bucketG2AffineC21] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -486,10 +486,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -497,12 +497,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. 
if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index c3f89f7406..c2e56a6936 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -36,45 +36,134 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { + // init the buckets var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - batch := newBatchG1Affine(&buckets, points) + // setup for the batch affine; + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch + + var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G1Affine // ... + + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok + } + + isFull := func() bool { + return cptP == batchSize + } + + executeAndReset := func() { + if cptP == 0 { + return + } + BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) + } + cptP = 0 + } + + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return + } + } + + // bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. 
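The special cases in the add closure above exist because the batched chord formula lambda = (y2-y1)/(x2-x1) is undefined whenever the two x-coordinates coincide: P + P needs the tangent (a doubling) and P + (-P) is the point at infinity, so both are resolved directly on the bucket before anything is queued for the batch. As a sketch of the guard in isolation (an illustrative helper, relying on fp.Element's Equal):

// chordSafe reports whether (p, q) can be fed to the batched affine addition:
// equal x-coordinates mean doubling or cancellation, which the chord formula
// cannot express and which the add closure above handles explicitly.
func chordSafe(p, q *G1Affine) bool {
	return !p.X.Equal(&q.X)
}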
+ + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } + } + nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -83,14 +172,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG1Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -145,162 +234,144 @@ type ibG1Affine interface { bucketG1AffineC21 } -type BatchG1Affine[B ibG1Affine] struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G1Affine - buckets *B -} +// processChunkG2BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. 
+// +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + digits []uint32) { + + // init the buckets + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } -func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { - batchSize := len(*buckets) / 5 + // setup for the batch affine; + batchSize := len(buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch -func (b *BatchG1Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} + var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G2Affine // ... -func (b *BatchG1Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} -func (b *BatchG1Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return + isFull := func() bool { + return cptP == batchSize } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() + + executeAndReset := func() { + if cptP == 0 { return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return + BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) } + cptP = 0 } - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch -func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] } - } - return queue -} + // 
bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } -// processChunkG2BatchAffine process a chunk of the scalars during the msm -// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition -// we use a batch affine addition. -// -// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 -// See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, - chRes chan<- g2JacExtended, - c uint64, - points []G2Affine, - pscalars []uint32) { + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } } - batch := newBatchG2Affine(&buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -309,14 +380,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -370,110 +441,3 @@ type ibG2Affine interface { bucketG2AffineC20 | bucketG2AffineC21 } - -type BatchG2Affine[B ibG2Affine] struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G2Affine - buckets *B -} - -func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { - batchSize := len(*buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG2Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} - -func (b *BatchG2Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG2Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG2Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} diff --git a/ecc/bls12-377/multiexp_jacobian.go b/ecc/bls12-377/multiexp_jacobian.go index 1981509ee0..2c95e7f536 100644 --- a/ecc/bls12-377/multiexp_jacobian.go +++ b/ecc/bls12-377/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -28,20 +28,18 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. 
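This is where the commit's "gymnastic to ensure buckets are on the stack" pays off: every bucketG1AffineCxx / bucketg1JacExtendedCxx instantiation in the type unions above is a fixed-size array type, so `var buckets B` has a size known at compile time and escape analysis can keep it in the goroutine's frame instead of the heap. A toy model of the idea (toy names; the escape report can be inspected with go build -gcflags=-m):

// Toy fixed-size bucket array: the length 1<<(c-1) is part of the type.
type bucketsC4 [1 << 3]uint64

// processToy mimics the generic chunk processors: B's array length is a
// compile-time constant, so buckets need not escape to the heap.
func processToy[B ~[1 << 3]uint64](out chan<- uint64) {
	var buckets B
	for i := range buckets {
		buckets[i] = uint64(i)
	}
	var total uint64
	for i := range buckets {
		total += buckets[i]
	}
	out <- total
}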
- for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } @@ -105,7 +103,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -113,20 +111,18 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index fe84ecc91f..afc5108951 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 200a5fc096..5914a5a0d6 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -118,12 +118,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG1(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
// _p := make([]G1Jac, nbSplits - 1) @@ -146,73 +146,73 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG1BatchAffine[bucketG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - _innerMsmG1(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG1BatchAffine[bucketG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG1BatchAffine[bucketG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG1BatchAffine[bucketG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - _innerMsmG1(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG1BatchAffine[bucketG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 14, points, 
digits, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG1BatchAffine[bucketG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG1BatchAffine[bucketG1AffineC20] processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG1BatchAffine[bucketG1AffineC21] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -232,10 +232,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -243,12 +243,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. 
if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -372,12 +372,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG2(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. // _p := make([]G2Jac, nbSplits - 1) @@ -400,73 +400,73 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk 
:= processChunkG2BatchAffine[bucketG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - _innerMsmG2(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG2BatchAffine[bucketG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG2BatchAffine[bucketG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG2BatchAffine[bucketG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - _innerMsmG2(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG2BatchAffine[bucketG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG2BatchAffine[bucketG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG2BatchAffine[bucketG2AffineC20] processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG2BatchAffine[bucketG2AffineC21] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -486,10 +486,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split // the last chunk may be processed with a different method than the rest, as it could be smaller. 
n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -497,12 +497,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index 06c5f74bfe..64ca8320b2 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -36,45 +36,134 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { + // init the buckets var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - batch := newBatchG1Affine(&buckets, points) + // setup for the batch affine; + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch + + var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G1Affine // ... + + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok + } + + isFull := func() bool { + return cptP == batchSize + } + + executeAndReset := func() { + if cptP == 0 { + return + } + BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) + } + cptP = 0 + } + + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return + } + } + + // bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. 
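
BatchAddG1Affine performs one shared inversion for the whole batch, so each bucket may be written at most once per batch; that is the invariant the canAdd, bucketIds and queue machinery above enforces. A toy sketch of that scheduling discipline, with plain integers as bucket IDs and batchSize shrunk to 2 for the demo (illustrative code, not the library's API):

package main

import "fmt"

type batchOp struct{ bucketID uint32 }

func main() {
	const batchSize = 2 // tiny on purpose; the real code derives it from len(buckets)
	busy := make(map[uint32]struct{})
	batch := make([]batchOp, 0, batchSize)
	var queue []batchOp

	flush := func() {
		if len(batch) == 0 {
			return
		}
		fmt.Println("batch add of", len(batch), "ops (one shared inversion)")
		batch = batch[:0]
		for k := range busy {
			delete(busy, k)
		}
	}

	for _, op := range []batchOp{{1}, {1}, {2}, {3}, {3}} {
		if _, taken := busy[op.bucketID]; taken {
			queue = append(queue, op) // bucket already written in this batch: defer
			continue
		}
		busy[op.bucketID] = struct{}{}
		batch = append(batch, op)
		if len(batch) == batchSize {
			flush()
		}
	}
	fmt.Println(len(queue), "ops deferred to later batches")
	flush() // flush a final, possibly partial batch
}

Deferred ops are retried once the conflicting bucket's batch has executed, which is what the processQueue loop below does.
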
+ + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } + } + nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -83,14 +172,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG1Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -145,162 +234,144 @@ type ibG1Affine interface { bucketG1AffineC21 } -type BatchG1Affine[B ibG1Affine] struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G1Affine - buckets *B -} +// processChunkG2BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. 
+// +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + digits []uint32) { + + // init the buckets + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } -func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { - batchSize := len(*buckets) / 5 + // setup for the batch affine; + batchSize := len(buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch -func (b *BatchG1Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} + var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G2Affine // ... -func (b *BatchG1Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} -func (b *BatchG1Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return + isFull := func() bool { + return cptP == batchSize } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() + + executeAndReset := func() { + if cptP == 0 { return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return + BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) } + cptP = 0 } - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch -func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] } - } - return queue -} + // 
bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } -// processChunkG2BatchAffine process a chunk of the scalars during the msm -// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition -// we use a batch affine addition. -// -// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 -// See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, - chRes chan<- g2JacExtended, - c uint64, - points []G2Affine, - pscalars []uint32) { + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } } - batch := newBatchG2Affine(&buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -309,14 +380,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -370,110 +441,3 @@ type ibG2Affine interface { bucketG2AffineC20 | bucketG2AffineC21 } - -type BatchG2Affine[B ibG2Affine] struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G2Affine - buckets *B -} - -func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { - batchSize := len(*buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG2Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} - -func (b *BatchG2Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG2Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG2Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} diff --git a/ecc/bls12-378/multiexp_jacobian.go b/ecc/bls12-378/multiexp_jacobian.go index 592070f11d..3ce29436eb 100644 --- a/ecc/bls12-378/multiexp_jacobian.go +++ b/ecc/bls12-378/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -28,20 +28,18 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. 
- for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } @@ -105,7 +103,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -113,20 +111,18 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index a94fc2e0b9..a77f7097e1 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index 50b716e3c4..7f730ca946 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -118,12 +118,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG1(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
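
A sketch of the first-chunk heuristic computed just above: scalars that fit in the low-order window only populate chunk 0, so when at least 10% of them are small, chunk 0 is split across two goroutines. The 0.1 threshold is the ratio from the hunk; the helper name is made up for illustration:

package main

import "fmt"

func shouldSplitFirstChunk(smallValues, nbScalars int) bool {
	return float64(smallValues)/float64(nbScalars) >= 0.1
}

func main() {
	fmt.Println(shouldSplitFirstChunk(5, 100))  // false: chunk 0 is balanced enough
	fmt.Println(shouldSplitFirstChunk(30, 100)) // true: chunk 0 would be the bottleneck
}
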
// _p := make([]G1Jac, nbSplits - 1) @@ -146,73 +146,73 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG1BatchAffine[bucketG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - _innerMsmG1(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG1BatchAffine[bucketG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG1BatchAffine[bucketG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG1BatchAffine[bucketG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - _innerMsmG1(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG1BatchAffine[bucketG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 14, points, 
digits, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG1BatchAffine[bucketG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG1BatchAffine[bucketG1AffineC20] processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG1BatchAffine[bucketG1AffineC21] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -232,10 +232,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -243,12 +243,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. 
if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -372,12 +372,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG2(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. // _p := make([]G2Jac, nbSplits - 1) @@ -400,73 +400,73 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk 
:= processChunkG2BatchAffine[bucketG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - _innerMsmG2(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG2BatchAffine[bucketG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG2BatchAffine[bucketG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG2BatchAffine[bucketG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - _innerMsmG2(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG2BatchAffine[bucketG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG2BatchAffine[bucketG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG2BatchAffine[bucketG2AffineC20] processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG2BatchAffine[bucketG2AffineC21] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -486,10 +486,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split // the last chunk may be processed with a different method than the rest, as it could be smaller. 
n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -497,12 +497,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index 3f0eeabba3..c965e24de2 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -36,45 +36,134 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { + // init the buckets var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - batch := newBatchG1Affine(&buckets, points) + // setup for the batch affine; + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch + + var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G1Affine // ... + + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok + } + + isFull := func() bool { + return cptP == batchSize + } + + executeAndReset := func() { + if cptP == 0 { + return + } + BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) + } + cptP = 0 + } + + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return + } + } + + // bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. 
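
The dispatch at the top of this hunk hands chunk j the slice digits[j*n:(j+1)*n], which assumes partitionScalars lays digits out chunk-major: the digit of point i for chunk j sits at digits[j*n+i]. A small sketch of that layout (illustrative names, integer tags in place of real digits):

package main

import "fmt"

// chunkDigits returns the digits of chunk j, one entry per point.
func chunkDigits(digits []uint32, j, n int) []uint32 {
	return digits[j*n : (j+1)*n]
}

func main() {
	n, nbChunks := 4, 3
	digits := make([]uint32, nbChunks*n)
	for j := 0; j < nbChunks; j++ {
		for i := 0; i < n; i++ {
			digits[j*n+i] = uint32(10*j + i) // tag slots so the layout is visible
		}
	}
	fmt.Println(chunkDigits(digits, 1, n)) // [10 11 12 13]
}
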
+ + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } + } + nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -83,14 +172,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG1Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -145,162 +234,144 @@ type ibG1Affine interface { bucketG1AffineC21 } -type BatchG1Affine[B ibG1Affine] struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G1Affine - buckets *B -} +// processChunkG2BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. 
+// +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + digits []uint32) { + + // init the buckets + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } -func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { - batchSize := len(*buckets) / 5 + // setup for the batch affine; + batchSize := len(buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch -func (b *BatchG1Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} + var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G2Affine // ... -func (b *BatchG1Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} -func (b *BatchG1Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return + isFull := func() bool { + return cptP == batchSize } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() + + executeAndReset := func() { + if cptP == 0 { return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return + BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) } + cptP = 0 } - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch -func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] } - } - return queue -} + // 
bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } -// processChunkG2BatchAffine process a chunk of the scalars during the msm -// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition -// we use a batch affine addition. -// -// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 -// See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, - chRes chan<- g2JacExtended, - c uint64, - points []G2Affine, - pscalars []uint32) { + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } } - batch := newBatchG2Affine(&buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -309,14 +380,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -370,110 +441,3 @@ type ibG2Affine interface { bucketG2AffineC20 | bucketG2AffineC21 } - -type BatchG2Affine[B ibG2Affine] struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G2Affine - buckets *B -} - -func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { - batchSize := len(*buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG2Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} - -func (b *BatchG2Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG2Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG2Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} diff --git a/ecc/bls12-381/multiexp_jacobian.go b/ecc/bls12-381/multiexp_jacobian.go index 3840228907..7c69354658 100644 --- a/ecc/bls12-381/multiexp_jacobian.go +++ b/ecc/bls12-381/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -28,20 +28,18 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. 
- for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } @@ -105,7 +103,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -113,20 +111,18 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index 8e356fde9d..457546524f 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 912d69ba44..37b43c6fe8 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -118,12 +118,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG1(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
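
For the buckets-to-total reduction named in the multiexp_affine comments above (total = bucket[0] + 2*bucket[1] + ... + n*bucket[n-1]), the usual trick is a running sum scanned from the highest bucket down, costing roughly 2n additions instead of one scalar multiplication per bucket. A sketch with integers standing in for curve points:

package main

import "fmt"

func reduceBuckets(buckets []int) int {
	runningSum, total := 0, 0
	for k := len(buckets) - 1; k >= 0; k-- {
		runningSum += buckets[k] // = b[k] + b[k+1] + ... + b[n-1]
		total += runningSum      // b[k] ends up counted k+1 times in total
	}
	return total
}

func main() {
	b := []int{3, 1, 4}
	fmt.Println(reduceBuckets(b))         // 17
	fmt.Println(1*b[0] + 2*b[1] + 3*b[2]) // 17
}
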
// _p := make([]G1Jac, nbSplits - 1) @@ -146,73 +146,73 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG1BatchAffine[bucketG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - _innerMsmG1(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG1BatchAffine[bucketG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG1BatchAffine[bucketG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG1BatchAffine[bucketG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - _innerMsmG1(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG1BatchAffine[bucketG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 14, points, 
digits, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG1BatchAffine[bucketG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG1BatchAffine[bucketG1AffineC20] processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG1BatchAffine[bucketG1AffineC21] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -232,10 +232,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -243,12 +243,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. 
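Editor's note: the slicing below only works because partitionScalars appears to lay the digits out chunk-major, i.e. the n digits of window j, one per point, are contiguous. A small sketch of that indexing, stated as an assumption inferred from the slices rather than documented API:

// chunk-major layout implied by the slices below:
// the digit of point i in window j lives at digits[j*n+i].
n := len(points)
digitsOfChunk := func(j int) []uint32 {
	return digits[j*n : (j+1)*n] // one uint32 per point
}
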
if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -372,12 +372,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG2(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. // _p := make([]G2Jac, nbSplits - 1) @@ -400,73 +400,73 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk 
:= processChunkG2BatchAffine[bucketG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - _innerMsmG2(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG2BatchAffine[bucketG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG2BatchAffine[bucketG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG2BatchAffine[bucketG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - _innerMsmG2(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG2BatchAffine[bucketG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG2BatchAffine[bucketG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG2BatchAffine[bucketG2AffineC20] processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG2BatchAffine[bucketG2AffineC21] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -486,10 +486,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split // the last chunk may be processed with a different method than the rest, as it could be smaller. 
n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -497,12 +497,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index 93b8a89cdb..10c47b3306 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -36,45 +36,134 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { + // init the buckets var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - batch := newBatchG1Affine(&buckets, points) + // setup for the batch affine; + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch + + var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G1Affine // ... + + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok + } + + isFull := func() bool { + return cptP == batchSize + } + + executeAndReset := func() { + if cptP == 0 { + return + } + BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) + } + cptP = 0 + } + + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return + } + } + + // bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. 
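Editor's note: one invariant in the closures above deserves a comment. R holds pointers into buckets, and BatchAddG1Affine rewrites every R[k] in place, so a given bucket may appear at most once per batch; a second op on the same bucket within one batch would derive its slope from a coordinate the batch is about to overwrite. That is exactly what the bucketIds set enforces through canAdd, and why a conflicting op is parked in queue. The guard pattern both the main loop and the processQueue closure below follow (names as in the surrounding code):

if canAdd(op.bucketID) {
	add(op) // bucket not yet referenced in the current batch
	if isFull() {
		executeAndReset() // one shared inversion amortized over the whole batch
	}
} else {
	queue = append(queue, op) // same bucket already pending: retry after reset
}

Note also that processQueue compacts with a swap-remove (queue[i] = queue[len(queue)-1]) and then decrements i, so the op swapped into slot i is examined rather than skipped.
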
+ + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } + } + nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -83,14 +172,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG1Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -145,162 +234,144 @@ type ibG1Affine interface { bucketG1AffineC21 } -type BatchG1Affine[B ibG1Affine] struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G1Affine - buckets *B -} +// processChunkG2BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. 
+// +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + digits []uint32) { + + // init the buckets + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } -func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { - batchSize := len(*buckets) / 5 + // setup for the batch affine; + batchSize := len(buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch -func (b *BatchG1Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} + var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G2Affine // ... -func (b *BatchG1Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} -func (b *BatchG1Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return + isFull := func() bool { + return cptP == batchSize } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() + + executeAndReset := func() { + if cptP == 0 { return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return + BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) } + cptP = 0 } - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch -func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] } - } - return queue -} + // 
bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } -// processChunkG2BatchAffine process a chunk of the scalars during the msm -// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition -// we use a batch affine addition. -// -// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 -// See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, - chRes chan<- g2JacExtended, - c uint64, - points []G2Affine, - pscalars []uint32) { + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } } - batch := newBatchG2Affine(&buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -309,14 +380,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -370,110 +441,3 @@ type ibG2Affine interface { bucketG2AffineC20 | bucketG2AffineC21 } - -type BatchG2Affine[B ibG2Affine] struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G2Affine - buckets *B -} - -func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { - batchSize := len(*buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG2Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} - -func (b *BatchG2Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG2Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG2Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} diff --git a/ecc/bls24-315/multiexp_jacobian.go b/ecc/bls24-315/multiexp_jacobian.go index be1b9a8b65..6663cc9e73 100644 --- a/ecc/bls24-315/multiexp_jacobian.go +++ b/ecc/bls24-315/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -28,20 +28,18 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. 
- for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } @@ -105,7 +103,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -113,20 +111,18 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index 352e8122b6..1b697194a5 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index f2c3d767a1..5cb36f1788 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -118,12 +118,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG1(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
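Editor's note: the 0.1 threshold guards against a skew that is easy to quantify. A scalar smaller than 2^c (for instance the 0/1 wire values of a SNARK witness) yields a non-zero digit in window 0 only. As a worked example with hypothetical numbers: if 90% of one million scalars are boolean, every chunk except the first sees at most the ~100k digits coming from the large scalars, while chunk 0 additionally absorbs every non-zero boolean scalar and can carry several times the work of any sibling; splitting it across two goroutines, as the code below does, roughly halves that straggler's latency.
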
// _p := make([]G1Jac, nbSplits - 1) @@ -146,73 +146,73 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG1BatchAffine[bucketG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - _innerMsmG1(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG1BatchAffine[bucketG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG1BatchAffine[bucketG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG1BatchAffine[bucketG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - _innerMsmG1(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG1BatchAffine[bucketG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 14, points, 
digits, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG1BatchAffine[bucketG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG1BatchAffine[bucketG1AffineC20] processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG1BatchAffine[bucketG1AffineC21] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -232,10 +232,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -243,12 +243,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. 
if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -372,12 +372,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG2(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. // _p := make([]G2Jac, nbSplits - 1) @@ -400,73 +400,73 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk 
:= processChunkG2BatchAffine[bucketG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - _innerMsmG2(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG2BatchAffine[bucketG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG2BatchAffine[bucketG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG2BatchAffine[bucketG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - _innerMsmG2(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG2BatchAffine[bucketG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG2BatchAffine[bucketG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG2BatchAffine[bucketG2AffineC20] processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG2BatchAffine[bucketG2AffineC21] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -486,10 +486,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split // the last chunk may be processed with a different method than the rest, as it could be smaller. 
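Editor's note: two patterns are visible in the dispatch above. First, batch-affine buckets only start at c = 10: a window of c bits uses on the order of 2^(c-1) buckets, and since batchSize is len(buckets)/5, smaller windows would make same-bucket collisions, and hence queueing, dominate. Second, each window size is paired with a processLastChunk sized to the leftover high bits. A sketch of that arithmetic, assuming fr.Limbs = 4 (256 scalar bits, which holds for the curves in these hunks):

lastWindowBits := func(c int) int {
	const scalarBits = 4 * 64 // fr.Limbs * 64
	if scalarBits%c == 0 {
		return c // c = 4, 8, 16: the last chunk reuses processChunk itself
	}
	// c = 5 -> 1 (C1), c = 10 -> 6 (C6), c = 13 -> 9 (C9), c = 20 -> 16 (C16)
	return scalarBits % c
}
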
n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -497,12 +497,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index 2f19119cc6..2833b83137 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -36,45 +36,134 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { + // init the buckets var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - batch := newBatchG1Affine(&buckets, points) + // setup for the batch affine; + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch + + var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G1Affine // ... + + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok + } + + isFull := func() bool { + return cptP == batchSize + } + + executeAndReset := func() { + if cptP == 0 { + return + } + BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) + } + cptP = 0 + } + + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return + } + } + + // bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. 
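Editor's note: why the add closure above screens for equal x-coordinates before queueing. The batched affine formula divides by the x-difference, and the shared inversion has no per-element fallback. Equal points are safe to queue, since the batch add helper detects them and switches to the tangent slope; the one fatal case is a bucket meeting its own negation, where the chord is vertical. Annotated, that guard reads:

if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) {
	// PP == -BK: the denominator PP.X - BK.X handed to the batch
	// inversion would be zero, so resolve the sum to infinity eagerly.
	BK.setInfinity()
	return
}
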
+ + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } + } + nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -83,14 +172,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG1Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -145,162 +234,144 @@ type ibG1Affine interface { bucketG1AffineC21 } -type BatchG1Affine[B ibG1Affine] struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G1Affine - buckets *B -} +// processChunkG2BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. 
+// +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + digits []uint32) { + + // init the buckets + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } -func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { - batchSize := len(*buckets) / 5 + // setup for the batch affine; + batchSize := len(buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch -func (b *BatchG1Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} + var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G2Affine // ... -func (b *BatchG1Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} -func (b *BatchG1Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return + isFull := func() bool { + return cptP == batchSize } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() + + executeAndReset := func() { + if cptP == 0 { return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return + BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) } + cptP = 0 } - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch -func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] } - } - return queue -} + // 
bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } -// processChunkG2BatchAffine process a chunk of the scalars during the msm -// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition -// we use a batch affine addition. -// -// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 -// See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, - chRes chan<- g2JacExtended, - c uint64, - points []G2Affine, - pscalars []uint32) { + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } } - batch := newBatchG2Affine(&buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -309,14 +380,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -370,110 +441,3 @@ type ibG2Affine interface { bucketG2AffineC20 | bucketG2AffineC21 } - -type BatchG2Affine[B ibG2Affine] struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G2Affine - buckets *B -} - -func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { - batchSize := len(*buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG2Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} - -func (b *BatchG2Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG2Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG2Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} diff --git a/ecc/bls24-317/multiexp_jacobian.go b/ecc/bls24-317/multiexp_jacobian.go index a79434b537..fccf3e949d 100644 --- a/ecc/bls24-317/multiexp_jacobian.go +++ b/ecc/bls24-317/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -28,20 +28,18 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. 
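
The chunk processors above and below all consume the same `digits` encoding produced by partitionScalars: each uint32 packs one signed window value, the low bit flags a subtraction, and the remaining bits carry the bucket index (shifted by one on the add path, since bucket 0 holds coefficient 1). A minimal decoding sketch — decodeDigit is illustrative and not a function in this patch:

package main

import "fmt"

// decodeDigit unpacks the signed-digit convention used by the chunk
// processors: digit == 0 means skip; an even digit adds the point to
// bucket (digit>>1)-1; an odd digit subtracts it from bucket digit>>1.
func decodeDigit(digit uint32) (bucketID uint32, sub, skip bool) {
	if digit == 0 {
		return 0, false, true
	}
	if digit&1 == 0 {
		return (digit >> 1) - 1, false, false
	}
	return digit >> 1, true, false
}

func main() {
	for _, d := range []uint32{0, 2, 3, 6, 7} {
		bucketID, sub, skip := decodeDigit(d)
		fmt.Printf("digit=%d -> bucket=%d sub=%v skip=%v\n", d, bucketID, sub, skip)
	}
}

The signed encoding is what halves the number of buckets a window needs: a c-bit window gets by with 2^(c-1) buckets instead of 2^c.
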
- for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } @@ -105,7 +103,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -113,20 +111,18 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index feb0b0c87f..eb190a317b 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index 19b6fd8496..e3c84b390f 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -118,12 +118,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG1(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
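
Two contracts from the MultiExp preamble above are worth spelling out: partitionScalars lays the digits out window-major, so the digit of scalar i for chunk j sits at digits[j*n+i] (this is what the slicing in _innerMsmG1 depends on), and smallValues drives the decision to process the first window in two halves. A sketch restating both contracts; the function names are illustrative, not part of the library:

package sketch

// digitsForChunk mirrors the slicing in _innerMsmG1: digits is a
// flattened nbChunks x n table, one row per c-bit window.
func digitsForChunk(digits []uint32, chunkID, n int) []uint32 {
	return digits[chunkID*n : (chunkID+1)*n]
}

// shouldSplitFirstChunk restates the 10% heuristic: small scalars (e.g.
// SNARK witness values 0 and 1) only populate the first window, so when
// they dominate, that window is split so both halves finish in roughly
// the time the other windows take.
func shouldSplitFirstChunk(smallValues, nbScalars int) bool {
	return float64(smallValues)/float64(nbScalars) >= 0.1
}
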
// _p := make([]G1Jac, nbSplits - 1) @@ -146,73 +146,73 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG1BatchAffine[bucketG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - _innerMsmG1(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG1BatchAffine[bucketG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG1BatchAffine[bucketG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG1BatchAffine[bucketG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - _innerMsmG1(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG1BatchAffine[bucketG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 14, points, 
digits, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG1BatchAffine[bucketG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG1BatchAffine[bucketG1AffineC20] processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG1BatchAffine[bucketG1AffineC21] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -232,10 +232,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -243,12 +243,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. 
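
The goroutine dispatch around this point follows a plain fan-out/fan-in shape: one buffered channel of capacity 1 per window, one goroutine per window, results collected by the caller. A stripped-down sketch of that shape, with partialSum standing in for g1JacExtended and the final window-weighted reduction elided (it lives outside these hunks):

package sketch

type partialSum struct{ chunkID uint64 }

func dispatch(nbChunks int, process func(chunkID uint64, out chan<- partialSum)) []partialSum {
	chChunks := make([]chan partialSum, nbChunks)
	for i := 0; i < nbChunks; i++ {
		chChunks[i] = make(chan partialSum, 1)
	}
	// highest window first, mirroring the dispatch order in _innerMsmG1
	for j := nbChunks - 1; j >= 0; j-- {
		go process(uint64(j), chChunks[j])
	}
	res := make([]partialSum, nbChunks)
	for i := range chChunks {
		res[i] = <-chChunks[i]
	}
	return res
}
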
if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -372,12 +372,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG2(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. // _p := make([]G2Jac, nbSplits - 1) @@ -400,73 +400,73 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk 
:= processChunkG2BatchAffine[bucketG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - _innerMsmG2(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG2BatchAffine[bucketG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG2BatchAffine[bucketG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG2BatchAffine[bucketG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - _innerMsmG2(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG2BatchAffine[bucketG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG2BatchAffine[bucketG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG2BatchAffine[bucketG2AffineC20] processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG2BatchAffine[bucketG2AffineC21] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -486,10 +486,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split // the last chunk may be processed with a different method than the rest, as it could be smaller. 
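
Each case in the switches above instantiates the generic chunk processor with a distinct bucket array type, so the number of buckets is fixed at compile time and `var buckets B` needs no heap allocation. A toy reduction of the trick; the types below are stand-ins for the library's bucketG1AffineCxx / bucketg2JacExtendedCxx arrays:

package sketch

type point struct{ x, y uint64 }

// one array type per window size c, holding 2^(c-1) buckets
type bucketsC4 [1 << 3]point
type bucketsC5 [1 << 4]point

type ibBuckets interface {
	bucketsC4 | bucketsC5
}

// processChunk gets a separate instantiation per window size;
// len(buckets) resolves per instantiation.
func processChunk[B ibBuckets]() int {
	var buckets B
	return len(buckets)
}

var nbBucketsC4 = processChunk[bucketsC4]() // 8
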
n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -497,12 +497,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index 0028bf14dd..5fea03fa0e 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -36,45 +36,134 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { + // init the buckets var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - batch := newBatchG1Affine(&buckets, points) + // setup for the batch affine; + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch + + var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G1Affine // ... + + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok + } + + isFull := func() bool { + return cptP == batchSize + } + + executeAndReset := func() { + if cptP == 0 { + return + } + BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) + } + cptP = 0 + } + + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return + } + } + + // bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. 
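
The bucketIds set above is what keeps the batching sound: BatchAddG1Affine derives every lambda from one shared batch inversion before writing any bucket back, so if the same bucket appeared twice in a batch, both operations would read the pre-batch value and one update would be lost. canAdd gates on that, and colliding ops fall through to the queue. A toy scheduler showing just that rule (types hypothetical, not the patch's):

package sketch

type op struct{ bucketID uint32 }

type scheduler struct {
	inFlight map[uint32]struct{} // buckets already written in this batch
	batch    []op
	size     int
}

// tryAdd reports false when the bucket is already used in the current
// batch; the caller is expected to queue the op and retry after a flush.
func (s *scheduler) tryAdd(o op, flush func([]op)) bool {
	if _, busy := s.inFlight[o.bucketID]; busy {
		return false
	}
	s.inFlight[o.bucketID] = struct{}{}
	s.batch = append(s.batch, o)
	if len(s.batch) == s.size {
		flush(s.batch)
		s.batch = s.batch[:0]
		for k := range s.inFlight {
			delete(s.inFlight, k)
		}
	}
	return true
}
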
+ + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } + } + nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -83,14 +172,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG1Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -145,162 +234,144 @@ type ibG1Affine interface { bucketG1AffineC21 } -type BatchG1Affine[B ibG1Affine] struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G1Affine - buckets *B -} +// processChunkG2BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. 
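
For reference, the per-operation arithmetic this batch amortizes, for a bucket R and an incoming affine point P on a short-Weierstrass curve (standard formulas, stated here for orientation rather than quoted from the patch):

\lambda_{\text{add}} = \frac{y_P - y_R}{x_P - x_R},
\qquad
\lambda_{\text{dbl}} = \frac{3 x_P^2}{2 y_P},

x_{R'} = \lambda^2 - x_R - x_P,
\qquad
y_{R'} = \lambda\,(x_R - x_{R'}) - y_R.

Only the denominator of lambda needs an inversion, and that denominator is exactly what is fed to the batch inversion: roughly one multiplication to finish lambda, one for lambda squared, one for y_{R'}, plus about three per element contributed by the batch-inversion unwind — on the order of five multiplications per point and a single shared inversion per batch.
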
+// +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + digits []uint32) { + + // init the buckets + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } -func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { - batchSize := len(*buckets) / 5 + // setup for the batch affine; + batchSize := len(buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch -func (b *BatchG1Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} + var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G2Affine // ... -func (b *BatchG1Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} -func (b *BatchG1Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return + isFull := func() bool { + return cptP == batchSize } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() + + executeAndReset := func() { + if cptP == 0 { return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return + BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) } + cptP = 0 } - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch -func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] } - } - return queue -} + // 
bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } -// processChunkG2BatchAffine process a chunk of the scalars during the msm -// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition -// we use a batch affine addition. -// -// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 -// See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, - chRes chan<- g2JacExtended, - c uint64, - points []G2Affine, - pscalars []uint32) { + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } } - batch := newBatchG2Affine(&buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -309,14 +380,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -370,110 +441,3 @@ type ibG2Affine interface { bucketG2AffineC20 | bucketG2AffineC21 } - -type BatchG2Affine[B ibG2Affine] struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G2Affine - buckets *B -} - -func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { - batchSize := len(*buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG2Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} - -func (b *BatchG2Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG2Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG2Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} diff --git a/ecc/bn254/multiexp_jacobian.go b/ecc/bn254/multiexp_jacobian.go index 2b0db816d2..ef34f5faad 100644 --- a/ecc/bn254/multiexp_jacobian.go +++ b/ecc/bn254/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -28,20 +28,18 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. 
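
Every chunk processor in this patch ends with the reduction described by the recurring comment total = bucket[0] + 2*bucket[1] + 3*bucket[2] + ... + n*bucket[n-1]. That weighted sum is not computed with scalar multiplications: scanning the buckets from the highest coefficient down with two accumulators yields it in about 2n group additions, because bucket[k] is folded into the running sum once for every step at or below k. A sketch with integers standing in for group elements:

package sketch

func reduceBuckets(buckets []int) (total int) {
	runningSum := 0
	for k := len(buckets) - 1; k >= 0; k-- {
		runningSum += buckets[k] // now equals bucket[k] + ... + bucket[n-1]
		total += runningSum      // so bucket[k] ends up counted (k+1) times
	}
	return total
}
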
- for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } @@ -105,7 +103,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -113,20 +111,18 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index a9df7af3fa..05d133cdbe 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index 9c168de3c5..b8fdb1314f 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -118,12 +118,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG1(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
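
The window size C picked by MultiExp above is the classic bucket-method trade-off. With n points and b-bit scalars (b = fr.Limbs*64 here), the usual cost model — standard analysis, not stated in this patch — is

\text{cost}(c) \;\approx\; \left\lceil \frac{b}{c} \right\rceil \left( n + 2^{\,c-1} \right)\ \text{group additions},

minimized for c near \log_2 n: more points justify wider windows. Note that the bw6 switches below expose a smaller set of window sizes (4, 5, 8, 16) than the bn/bls ones.
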
// _p := make([]G1Jac, nbSplits - 1) @@ -146,28 +146,28 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -187,10 +187,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -198,12 +198,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. 
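
The splitFirstChunk branch just below involves a small merge stage: two goroutines process each half of the points into one buffered channel, and a third folds the two partial sums before forwarding to the first window's channel. Its shape in isolation, with partial and addPartial as illustrative stand-ins for the extended-Jacobian accumulator:

package sketch

type partial struct{ v int }

func addPartial(a, b partial) partial { return partial{a.v + b.v} }

func splitFirst(out chan<- partial, process func(lo, hi int, ch chan<- partial), n int) {
	chSplit := make(chan partial, 2)
	split := n / 2
	go process(0, split, chSplit)
	go process(split, n, chSplit)
	go func() {
		s1 := <-chSplit
		s2 := <-chSplit
		out <- addPartial(s1, s2)
	}()
}
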
if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -327,12 +327,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG2(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. // _p := make([]G2Jac, nbSplits - 1) @@ -355,28 +355,28 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -396,10 +396,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split // the last chunk may be processed with a different method than the rest, 
as it could be smaller. n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -407,12 +407,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index 1cc44fcaa6..8305011734 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -36,45 +36,134 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { + // init the buckets var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - batch := newBatchG1Affine(&buckets, points) + // setup for the batch affine; + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch + + var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G1Affine // ... + + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok + } + + isFull := func() bool { + return cptP == batchSize + } + + executeAndReset := func() { + if cptP == 0 { + return + } + BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) + } + cptP = 0 + } + + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return + } + } + + // bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. 
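
processQueue below drains the retry queue with the swap-remove idiom: a consumed element is overwritten by the slice's last element, the slice shrinks, and i-- makes the loop reconsider the swapped-in element. The reordering is harmless here because bucket accumulation commutes. The idiom on its own, as a generic sketch:

package sketch

// drain consumes every element for which tryConsume succeeds, in O(1)
// per removal, without preserving order.
func drain[T any](queue []T, tryConsume func(T) bool) []T {
	for i := 0; i < len(queue); i++ {
		if tryConsume(queue[i]) {
			queue[i] = queue[len(queue)-1]
			queue = queue[:len(queue)-1]
			i--
		}
	}
	return queue
}
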
+ + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } + } + nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -83,14 +172,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG1Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -123,162 +212,144 @@ type ibG1Affine interface { bucketG1AffineC16 } -type BatchG1Affine[B ibG1Affine] struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G1Affine - buckets *B -} +// processChunkG2BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. 
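
The shared inversion this batching amortizes is Montgomery's batch-inversion trick: a forward pass of prefix products, a single field inversion, then a backward unwind, turning n inversions into one inversion plus about 3(n-1) multiplications. A self-contained sketch over math/big with a toy prime; the library's versions operate directly on the curve's field elements and, like this sketch, ignore zero inputs:

package main

import (
	"fmt"
	"math/big"
)

func batchInvert(a []*big.Int, p *big.Int) []*big.Int {
	res := make([]*big.Int, len(a))
	acc := big.NewInt(1)
	for i := range a {
		res[i] = new(big.Int).Set(acc) // a[0] * ... * a[i-1]
		acc.Mul(acc, a[i]).Mod(acc, p)
	}
	acc.ModInverse(acc, p) // the one inversion shared by the whole batch
	for i := len(a) - 1; i >= 0; i-- {
		res[i].Mul(res[i], acc).Mod(res[i], p) // now equals a[i]^-1 mod p
		acc.Mul(acc, a[i]).Mod(acc, p)
	}
	return res
}

func main() {
	p := big.NewInt(101)
	inv := batchInvert([]*big.Int{big.NewInt(3), big.NewInt(7)}, p)
	fmt.Println(inv) // [34 29]: 3*34 ≡ 1 and 7*29 ≡ 1 (mod 101)
}
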
+// +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + digits []uint32) { + + // init the buckets + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } -func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { - batchSize := len(*buckets) / 5 + // setup for the batch affine; + batchSize := len(buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch -func (b *BatchG1Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} + var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G2Affine // ... -func (b *BatchG1Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} -func (b *BatchG1Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return + isFull := func() bool { + return cptP == batchSize } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() + + executeAndReset := func() { + if cptP == 0 { return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return + BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) } + cptP = 0 } - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch -func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] } - } - return queue -} + // 
bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } -// processChunkG2BatchAffine process a chunk of the scalars during the msm -// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition -// we use a batch affine addition. -// -// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 -// See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, - chRes chan<- g2JacExtended, - c uint64, - points []G2Affine, - pscalars []uint32) { + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } } - batch := newBatchG2Affine(&buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -287,14 +358,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -326,110 +397,3 @@ type ibG2Affine interface { bucketG2AffineC8 | bucketG2AffineC16 } - -type BatchG2Affine[B ibG2Affine] struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G2Affine - buckets *B -} - -func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { - batchSize := len(*buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG2Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} - -func (b *BatchG2Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG2Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG2Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} diff --git a/ecc/bw6-633/multiexp_jacobian.go b/ecc/bw6-633/multiexp_jacobian.go index 996352830f..29756cc499 100644 --- a/ecc/bw6-633/multiexp_jacobian.go +++ b/ecc/bw6-633/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -28,20 +28,18 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. 
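
For the small windows that stay on the Jacobian path, addMixed/subMixed below accumulate an affine point into an extended-Jacobian bucket with no inversion at all. Assuming the usual extended ("XYZZ") representation for g1JacExtended/g2JacExtended — an assumption, since the type definitions sit outside these hunks — a bucket stores (X, Y, ZZ, ZZZ) with

x = X/ZZ, \qquad y = Y/ZZZ, \qquad ZZ^3 = ZZZ^2,

so mixed addition and subtraction cost only field multiplications, at the price of keeping four coordinates per bucket instead of the two that the affine buckets use for the large windows.
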
- for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } @@ -79,7 +77,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -87,20 +85,18 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 5fd9a64f32..75e1904bf0 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index 04a630a92a..98b3867477 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -118,12 +118,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG1(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
// _p := make([]G1Jac, nbSplits - 1) @@ -146,29 +146,29 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -188,10 +188,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -199,12 +199,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. 
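// [editor's note] The slicing above assumes partitionScalars lays the digits
// out chunk-major: one contiguous window of n = len(points) entries per chunk,
// so chunk j reads digits[j*n:(j+1)*n]. A tiny index helper (hypothetical,
// for illustration) makes the layout explicit:

func digitIndex(chunk, pointIdx, n int) int {
	// digits[chunk*n : (chunk+1)*n] is the window handed to chunk `chunk`;
	// within it, entry pointIdx is the digit of points[pointIdx]
	return chunk*n + pointIdx
}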
if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -328,12 +328,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG2(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. // _p := make([]G2Jac, nbSplits - 1) @@ -356,29 +356,29 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -398,10 +398,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split 
// the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -409,12 +409,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index 2a439173b9..8b01509015 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -36,45 +36,134 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { + // init the buckets var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - batch := newBatchG1Affine(&buckets, points) + // setup for the batch affine; + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch + + var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G1Affine // ... + + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok + } + + isFull := func() bool { + return cptP == batchSize + } + + executeAndReset := func() { + if cptP == 0 { + return + } + BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) + } + cptP = 0 + } + + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return + } + } + + // bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. 
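// [editor's note] A batchOp packs the point index and the sign into pointID:
// index<<1, with the low bit set for a subtraction. The add closure above
// recovers the index with op.pointID>>1, and op.isNeg() (defined elsewhere in
// the package) is assumed to test the low bit. A round-trip sketch, with a
// hypothetical helper name:

func packOp(bucketID, pointIdx uint32, neg bool) batchOp {
	op := batchOp{bucketID: bucketID, pointID: pointIdx << 1} // low bit reserved for the sign
	if neg {
		op.pointID |= 1 // mark as subtraction; the points index is still op.pointID >> 1
	}
	return op
}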
+ + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } + } + nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -83,14 +172,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG1Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -123,162 +212,144 @@ type ibG1Affine interface { bucketG1AffineC16 } -type BatchG1Affine[B ibG1Affine] struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G1Affine - buckets *B -} +// processChunkG2BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. 
+// +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + digits []uint32) { + + // init the buckets + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } -func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { - batchSize := len(*buckets) / 5 + // setup for the batch affine; + batchSize := len(buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch -func (b *BatchG1Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} + var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G2Affine // ... -func (b *BatchG1Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} -func (b *BatchG1Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return + isFull := func() bool { + return cptP == batchSize } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() + + executeAndReset := func() { + if cptP == 0 { return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return + BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) } + cptP = 0 } - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch -func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] } - } - return queue -} + // 
bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } -// processChunkG2BatchAffine process a chunk of the scalars during the msm -// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition -// we use a batch affine addition. -// -// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 -// See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, - chRes chan<- g2JacExtended, - c uint64, - points []G2Affine, - pscalars []uint32) { + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } } - batch := newBatchG2Affine(&buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -287,14 +358,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -326,110 +397,3 @@ type ibG2Affine interface { bucketG2AffineC8 | bucketG2AffineC16 } - -type BatchG2Affine[B ibG2Affine] struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G2Affine - buckets *B -} - -func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { - batchSize := len(*buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG2Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} - -func (b *BatchG2Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG2Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG2Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} diff --git a/ecc/bw6-756/multiexp_jacobian.go b/ecc/bw6-756/multiexp_jacobian.go index 1f7ec4b3f8..10a354ae58 100644 --- a/ecc/bw6-756/multiexp_jacobian.go +++ b/ecc/bw6-756/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -28,20 +28,18 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. 
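// [editor's note] Every chunk processor in this patch ends with the
// "reduce buckets into total" step sketched in the comments above:
// total = bucket[0] + 2*bucket[1] + ... + n*bucket[n-1]. That weighted sum
// costs only 2(n-1) group additions when taken as a running sum from the top
// bucket down -- shown here over int64 stand-ins for group elements:

func reduceBuckets(buckets []int64) int64 {
	var runningSum, total int64
	for i := len(buckets) - 1; i >= 0; i-- {
		runningSum += buckets[i] // runningSum = buckets[i] + ... + buckets[n-1]
		total += runningSum      // buckets[i] ends up accumulated (i+1) times
	}
	return total
}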
- for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } @@ -79,7 +77,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -87,20 +85,18 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index c8ddd47a9f..c37051d70d 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index b0fb81e0e6..9a41a9176f 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -118,12 +118,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG1(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
// _p := make([]G1Jac, nbSplits - 1) @@ -146,29 +146,29 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -188,10 +188,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -199,12 +199,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. 
if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -328,12 +328,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG2(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. // _p := make([]G2Jac, nbSplits - 1) @@ -356,29 +356,29 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -398,10 +398,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split 
// the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -409,12 +409,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index 6fec5d8f63..6cdd72b7a4 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -36,45 +36,134 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { + // init the buckets var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - batch := newBatchG1Affine(&buckets, points) + // setup for the batch affine; + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch + + var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G1Affine // ... + + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok + } + + isFull := func() bool { + return cptP == batchSize + } + + executeAndReset := func() { + if cptP == 0 { + return + } + BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) + } + cptP = 0 + } + + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return + } + } + + // bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. 
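// [editor's note] A worked example for the batch-size heuristic above,
// assuming the signed-digit encoding uses roughly 2^(c-1) buckets per chunk:
// for c = 16 that is 32768 buckets, len(buckets)/5 = 6553, clamped down to
// MAX_BATCH_SIZE = 600; for a hypothetical c = 10, 512 buckets give a batch
// size of 102 and neither clamp fires. The same logic, extracted as a pure
// function for illustration:

func batchSizeFor(nbBuckets int) int {
	batchSize := nbBuckets / 5
	if batchSize > MAX_BATCH_SIZE {
		batchSize = MAX_BATCH_SIZE // e.g. 32768 buckets -> 600
	}
	if batchSize <= 0 {
		batchSize = 1 // degenerate bucket counts still make progress
	}
	return batchSize
}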
+ + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } + } + nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -83,14 +172,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG1Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -123,162 +212,144 @@ type ibG1Affine interface { bucketG1AffineC16 } -type BatchG1Affine[B ibG1Affine] struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G1Affine - buckets *B -} +// processChunkG2BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. 
+// +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + digits []uint32) { + + // init the buckets + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } -func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { - batchSize := len(*buckets) / 5 + // setup for the batch affine; + batchSize := len(buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch -func (b *BatchG1Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} + var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G2Affine // ... -func (b *BatchG1Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} -func (b *BatchG1Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return + isFull := func() bool { + return cptP == batchSize } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() + + executeAndReset := func() { + if cptP == 0 { return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return + BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) } + cptP = 0 } - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch -func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] } - } - return queue -} + // 
bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } -// processChunkG2BatchAffine process a chunk of the scalars during the msm -// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition -// we use a batch affine addition. -// -// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 -// See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, - chRes chan<- g2JacExtended, - c uint64, - points []G2Affine, - pscalars []uint32) { + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } } - batch := newBatchG2Affine(&buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -287,14 +358,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -326,110 +397,3 @@ type ibG2Affine interface { bucketG2AffineC8 | bucketG2AffineC16 } - -type BatchG2Affine[B ibG2Affine] struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G2Affine - buckets *B -} - -func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { - batchSize := len(*buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG2Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} - -func (b *BatchG2Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG2Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG2Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} diff --git a/ecc/bw6-761/multiexp_jacobian.go b/ecc/bw6-761/multiexp_jacobian.go index 48249ca28f..045bace5e7 100644 --- a/ecc/bw6-761/multiexp_jacobian.go +++ b/ecc/bw6-761/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -28,20 +28,18 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. 
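// [editor's note] Why canAdd and the bucketIds set? R holds *pointers* into
// buckets, and BatchAdd writes all the results back in one pass, so two ops on
// the same bucket inside one batch would both be computed against the stale,
// pre-batch value, and the second write would clobber the first. A minimal
// illustration of the hazard with plain integers (hypothetical helper):

func whyOneOpPerBucketPerBatch() int {
	bucket := 10
	r := []*int{&bucket, &bucket} // same bucket queued twice in one batch
	p := []int{1, 2}
	staged := make([]int, len(r))
	for i := range r {
		staged[i] = *r[i] + p[i] // both reads see 10: staged = [11, 12]
	}
	for i := range r {
		*r[i] = staged[i] // second write overwrites the first
	}
	return bucket // 12, not the correct 13 -- hence one op per bucket per batch
}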
- for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } @@ -79,7 +77,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -87,20 +85,18 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index cbcc319e1a..b0a94d79dd 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index bd4a489361..6dca2eb861 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -387,12 +387,12 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsm{{ $.UPointName }}(p, int(C), points, pscalars, splitFirstChunk) + innerMsm{{ $.UPointName }}(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
@@ -417,7 +417,7 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem } -func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffine }}, pscalars []uint32, splitFirstChunk bool) { +func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffine }}, digits []uint32, splitFirstChunk bool) { {{- /* TODO @gbotrel need to deal with cases where lastC == 1 ; having a whole chunk with 1-bit window makes no sense */}} {{- /* also need to determine until which window size the ext-jacobian version is worth it. */}} switch c { @@ -430,14 +430,14 @@ func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffi processChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{$c}}] {{- end}} {{- if eq $c $lc}} - _innerMsm{{ $.UPointName }}(p, {{$c}}, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsm{{ $.UPointName }}(p, {{$c}}, points, digits, splitFirstChunk, processChunk, processChunk) {{- else}} {{- if le $lc 9}} processLastChunk := processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{lastC $c}}] {{- else}} processLastChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{lastC $c}}] {{- end}} - _innerMsm{{ $.UPointName }}(p, {{$c}}, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsm{{ $.UPointName }}(p, {{$c}}, points, digits, splitFirstChunk, processChunk, processLastChunk) {{- end}} {{- end}} default: @@ -445,8 +445,8 @@ func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffi } } -func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, pscalars []uint32)) *{{ $.TJacobian }} { +func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, digits []uint32)) *{{ $.TJacobian }} { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -466,10 +466,10 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j >0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -477,12 +477,12 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. 
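// [editor's note] A worked example for the splitFirstChunk heuristic above:
// "small" scalars contribute only to the first (least-significant) window,
// which is common for SNARK witnesses full of 0/1 values, so the first chunk
// carries disproportionate work. With len(scalars) = 1000 and
// smallValues = 150, the ratio 150/1000 = 0.15 >= 0.1 triggers the split: the
// code below processes the first chunk as two halves and sums the two partial
// results read from chSplit.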
 	if !splitFirstChunk {
-		go processChunk(0, chChunks[0], c, points, pscalars[:n])
+		go processChunk(0, chChunks[0], c, points, digits[:n])
 	} else {
 		chSplit := make(chan {{ $.TJacobianExtended }}, 2)
 		split := n / 2
-		go processChunk(0, chSplit, c, points[:split], pscalars[:split])
-		go processChunk(0, chSplit, c, points[split:], pscalars[split:n])
+		go processChunk(0, chSplit, c, points[:split], digits[:split])
+		go processChunk(0, chSplit, c, points[split:], digits[split:n])
 		go func() {
 			s1 := <-chSplit
 			s2 := <-chSplit
diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl
index 518c7e3406..8fe8391308 100644
--- a/internal/generator/ecc/template/multiexp_affine.go.tmpl
+++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl
@@ -7,6 +7,8 @@
 {{ $G2TJacobianExtended := print (toLower .G2.PointName) "JacExtended" }}
 
+
+
 const MAX_BATCH_SIZE = 600
 
 type batchOp struct {
@@ -37,45 +39,136 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64
 	chRes chan<- {{ $.TJacobianExtended }},
 	c uint64,
 	points []{{ $.TAffine }},
-	pscalars []uint32) {
+	digits []uint32) {
 
+	// init the buckets
 	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
-	batch := newBatch{{ $.TAffine }}(&buckets, points)
+	// set up the batch affine addition
+	batchSize := len(buckets) / 5
+	if batchSize > MAX_BATCH_SIZE {
+		batchSize = MAX_BATCH_SIZE
+	}
+	if batchSize <= 0 {
+		batchSize = 1
+	}
+	bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here
+	cptP := 0 // counts the number of points added to the current batch
+
+	var P [MAX_BATCH_SIZE]{{ $.TAffine }}  // allocated on the stack
+	var R [MAX_BATCH_SIZE]*{{ $.TAffine }} // ...
+
+	canAdd := func(bID uint32) bool {
+		_, ok := bucketIds[bID]
+		return !ok
+	}
+
+	isFull := func() bool {
+		return cptP == batchSize
+	}
+
+	executeAndReset := func() {
+		if cptP == 0 {
+			return
+		}
+		BatchAdd{{ $.TAffine }}(R[:cptP], P[:cptP], cptP)
+		for k := range bucketIds {
+			delete(bucketIds, k)
+		}
+		cptP = 0
+	}
+
+	add := func(op batchOp) {
+		// canAdd must be called first --> ensures the bucket is not already "used" in the current batch
+
+		BK := &buckets[op.bucketID]
+		PP := &points[op.pointID>>1]
+		if PP.IsInfinity() {
+			return
+		}
+		// handle special cases with inf or -P / P
+		if BK.IsInfinity() {
+			if op.isNeg() {
+				BK.Neg(PP)
+			} else {
+				BK.Set(PP)
+			}
+			return
+		}
+		if op.isNeg() {
+			// if bucket == P --> -P == 0
+			if BK.Equal(PP) {
+				BK.setInfinity()
+				return
+			}
+		} else {
+			// if bucket == -P, B == 0
+			if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) {
+				BK.setInfinity()
+				return
+			}
+		}
+
+		// bucketIds[cptP] = op.bucketID
+		bucketIds[op.bucketID] = struct{}{}
+		R[cptP] = BK
+		if op.isNeg() {
+			P[cptP].Neg(PP)
+		} else {
+			P[cptP].Set(PP)
+		}
+		cptP++
+	}
+
+	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
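// [editor's note] The processQueue closure below retries parked ops with an
// in-place swap-remove: a consumed slot is overwritten by the last element,
// the slice shrinks by one, and i-- re-examines the swapped-in op. A short
// trace, assuming every op has become addable:
//
//	queue = [a b c]  i=0: run a, swap in c -> queue = [c b], revisit slot 0
//	queue = [c b]    i=0: run c, swap in b -> queue = [b],   revisit slot 0
//	queue = [b]      i=0: run b            -> queue = []
//
// Order is not preserved, which is fine here: ops parked for distinct buckets
// are independent of each other.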
+
+
+	processQueue := func () {
+		// for i := len(queue) - 1; i >= 0; i-- {
+		for i := 0; i < len(queue); i++ {
+			if canAdd(queue[i].bucketID) {
+				add(queue[i])
+				if isFull() {
+					executeAndReset()
+				}
+				queue[i] = queue[len(queue)-1]
+				queue = queue[:len(queue)-1]
+				i--
+			}
+		}
+	}
+
 	nbBatches := 0
-	for i := 0; i < len(pscalars); i++ {
-		bits := pscalars[i]
+	for i, digit := range digits {
-		if bits == 0 {
+		if digit == 0 {
 			continue
 		}
 		op := batchOp{pointID: uint32(i) << 1}
 		// if msbWindow bit is set, we need to subtract
-		if bits&1 == 0 {
+		if digit&1 == 0 {
 			// add
-			op.bucketID = uint32((bits>>1) - 1)
-			// buckets[bits-1].Add(&points[i], &buckets[bits-1])
+			op.bucketID = uint32((digit>>1) - 1)
 		} else {
 			// sub
-			op.bucketID = (uint32((bits>>1)))
+			op.bucketID = (uint32((digit>>1)))
 			op.pointID += 1
-			// op.isNeg = true
-			// buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i])
 		}
-		if batch.CanAdd(op.bucketID) {
-			batch.Add(op)
-			if batch.IsFull() {
-				batch.ExecuteAndReset()
+		if canAdd(op.bucketID) {
+			add(op)
+			if isFull() {
+				executeAndReset()
 				nbBatches++
 				if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing
-					batch.Add(queue[len(queue)-1])
+					add(queue[len(queue)-1])
 					queue = queue[:len(queue)-1]
 				}
+				// processQueue()
 			}
 		} else {
 			// put it in queue.
@@ -84,14 +177,14 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64
 	}
 	// fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n",
 	// 	chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points))
-	// batch.ExecuteAndReset()
+	// executeAndReset()
 	for len(queue) != 0 {
-		queue = processQueue{{ $.TAffine }}(queue, &batch)
-		batch.ExecuteAndReset() // execute batch even if not full.
+		processQueue()
+		executeAndReset() // execute batch even if not full.
 	}
 	// flush items in batch.
-	batch.ExecuteAndReset()
+	executeAndReset()
 	// reduce buckets into total
 	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ...
+ n*bucket[n-1] @@ -122,112 +215,4 @@ type ib{{ $.TAffine }} interface { {{- end}} } - -type Batch{{ $.TAffine }}[B ib{{ $.TAffine }}] struct { - P [MAX_BATCH_SIZE]{{ $.TAffine }} - R [MAX_BATCH_SIZE]*{{ $.TAffine }} - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []{{ $.TAffine }} - buckets *B -} - -func newBatch{{ $.TAffine }}[B ib{{ $.TAffine }}](buckets *B, points []{{ $.TAffine }}) Batch{{ $.TAffine }}[B] { - batchSize := len(*buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return Batch{{ $.TAffine }}[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} - -func (b *Batch{{ $.TAffine }}[B]) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *Batch{{ $.TAffine }}[B]) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAdd{{ $.TAffine }}(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *Batch{{ $.TAffine }}[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *Batch{{ $.TAffine }}[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueue{{ $.TAffine }}[B ib{{ $.TAffine }}](queue []batchOp, batch *Batch{{ $.TAffine }}[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} - {{end }} diff --git a/internal/generator/ecc/template/multiexp_jacobian.go.tmpl b/internal/generator/ecc/template/multiexp_jacobian.go.tmpl index 8fb94f9f5b..ee1f1d2080 100644 --- a/internal/generator/ecc/template/multiexp_jacobian.go.tmpl +++ b/internal/generator/ecc/template/multiexp_jacobian.go.tmpl @@ -19,7 +19,7 @@ func processChunk{{ $.UPointName }}Jacobian[B ib{{ $.TJacobianExtended }}](chunk chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, - pscalars []uint32) { + digits []uint32) { @@ -29,20 +29,18 @@ func processChunk{{ $.UPointName }}Jacobian[B ib{{ $.TJacobianExtended }}](chunk } // for each scalars, get the digit corresponding to the chunk we're processing. 
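// A note on the digit convention shared by the batch-affine loop above and
// the jacobian loop below: each digit is a signed base-2^c limb, pre-encoded
// in a uint32. Zero means "skip this point"; otherwise the low bit is a sign
// flag and the remaining bits select the bucket. A sketch of the decoding
// (hypothetical helper; callers must check digit != 0 first):
//
//	func decodeDigit(digit uint32) (bucketID uint32, isNeg bool) {
//		if digit&1 == 1 {
//			return digit >> 1, true // subtract the point from bucket digit>>1
//		}
//		return (digit >> 1) - 1, false // add the point to bucket (digit>>1)-1
//	}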
- for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits & 1 == 0 { + if digit & 1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits>>1)].subMixed(&points[i]) + buckets[(digit>>1)].subMixed(&points[i]) } } diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index 5fa8d37944..49f36bb6b3 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -287,7 +287,7 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { var testPoint {{ $.TAffine }} - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { From 653877c58e38196aa069e434f90ad5a4a90ba9e0 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 9 Nov 2022 07:41:01 -0600 Subject: [PATCH 08/43] feat: toying with batch size --- ecc/bls12-377/multiexp_affine.go | 38 +++++++++---------- ecc/bls12-377/multiexp_test.go | 8 +--- ecc/bls12-378/multiexp_affine.go | 38 +++++++++---------- ecc/bls12-378/multiexp_test.go | 8 +--- ecc/bls12-381/multiexp_affine.go | 38 +++++++++---------- ecc/bls12-381/multiexp_test.go | 8 +--- ecc/bls24-315/multiexp_affine.go | 38 +++++++++---------- ecc/bls24-315/multiexp_test.go | 8 +--- ecc/bls24-317/multiexp_affine.go | 38 +++++++++---------- ecc/bls24-317/multiexp_test.go | 8 +--- ecc/bn254/multiexp_affine.go | 38 +++++++++---------- ecc/bn254/multiexp_test.go | 8 +--- ecc/bw6-633/multiexp_affine.go | 38 +++++++++---------- ecc/bw6-633/multiexp_test.go | 8 +--- ecc/bw6-756/multiexp_affine.go | 38 +++++++++---------- ecc/bw6-756/multiexp_test.go | 8 +--- ecc/bw6-761/multiexp_affine.go | 38 +++++++++---------- ecc/bw6-761/multiexp_test.go | 8 +--- .../ecc/template/multiexp_affine.go.tmpl | 22 +++++------ .../ecc/template/tests/multiexp.go.tmpl | 8 +--- 20 files changed, 201 insertions(+), 243 deletions(-) diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index c2e56a6936..015864143d 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -16,7 +16,7 @@ package bls12377 -const MAX_BATCH_SIZE = 600 +const MAX_BATCH_SIZE = 2000 type batchOp struct { bucketID, pointID uint32 @@ -45,7 +45,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -120,7 +120,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -159,19 +159,19 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? 
should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() @@ -253,7 +253,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -328,7 +328,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -367,19 +367,19 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index afc5108951..72f274c242 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -698,11 +698,7 @@ func fillBenchBasesG2(samplePoints []G2Affine) { func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - var mixer fr.Element - mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") - for i := 1; i <= len(sampleScalars); i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() + for i := 0; i < len(sampleScalars); i++ { + sampleScalars[i].SetRandom() } } diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index 64ca8320b2..fa5dc6c792 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -16,7 +16,7 @@ package bls12378 -const MAX_BATCH_SIZE = 600 +const MAX_BATCH_SIZE = 2000 type batchOp struct { bucketID, pointID uint32 @@ -45,7 +45,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -120,7 +120,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. 
+ queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -159,19 +159,19 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() @@ -253,7 +253,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -328,7 +328,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -367,19 +367,19 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index a77f7097e1..85f7b72e69 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -698,11 +698,7 @@ func fillBenchBasesG2(samplePoints []G2Affine) { func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - var mixer fr.Element - mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") - for i := 1; i <= len(sampleScalars); i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). 
- FromMont() + for i := 0; i < len(sampleScalars); i++ { + sampleScalars[i].SetRandom() } } diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index c965e24de2..f8ae40fb18 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -16,7 +16,7 @@ package bls12381 -const MAX_BATCH_SIZE = 600 +const MAX_BATCH_SIZE = 2000 type batchOp struct { bucketID, pointID uint32 @@ -45,7 +45,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -120,7 +120,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -159,19 +159,19 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() @@ -253,7 +253,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -328,7 +328,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -367,19 +367,19 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. 
queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index 457546524f..dbcdc6eb85 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -698,11 +698,7 @@ func fillBenchBasesG2(samplePoints []G2Affine) { func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - var mixer fr.Element - mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") - for i := 1; i <= len(sampleScalars); i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() + for i := 0; i < len(sampleScalars); i++ { + sampleScalars[i].SetRandom() } } diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index 10c47b3306..485f6960ff 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -16,7 +16,7 @@ package bls24315 -const MAX_BATCH_SIZE = 600 +const MAX_BATCH_SIZE = 2000 type batchOp struct { bucketID, pointID uint32 @@ -45,7 +45,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -120,7 +120,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -159,19 +159,19 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() @@ -253,7 +253,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -328,7 +328,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. 
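// The batchSize heuristic is the knob this commit is toying with. Worked
// numbers, assuming window c = 16: there are 2^(16-1) = 32768 buckets, so
// len(buckets)/30 gives 1092, under the raised MAX_BATCH_SIZE of 2000; for
// c = 10 it drops to 512/30 = 17. A smaller batch-to-bucket ratio makes
// same-bucket collisions within a batch rarer, while a larger batch amortizes
// the single field inversion (the 1I in "cost add: 5*batchSize M + 1I") over
// more affine additions:
//
//	nbBuckets := 1 << (c - 1) // 32768 for c = 16
//	batchSize := nbBuckets / 30
//	// per-op cost ≈ 5M + I/batchSize, so growing batchSize helps until
//	// the inversion's share becomes negligible.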
processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -367,19 +367,19 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index 1b697194a5..b2978d6aa0 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -698,11 +698,7 @@ func fillBenchBasesG2(samplePoints []G2Affine) { func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - var mixer fr.Element - mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") - for i := 1; i <= len(sampleScalars); i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() + for i := 0; i < len(sampleScalars); i++ { + sampleScalars[i].SetRandom() } } diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index 2833b83137..432592fa23 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -16,7 +16,7 @@ package bls24317 -const MAX_BATCH_SIZE = 600 +const MAX_BATCH_SIZE = 2000 type batchOp struct { bucketID, pointID uint32 @@ -45,7 +45,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -120,7 +120,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -159,19 +159,19 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. 
queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() @@ -253,7 +253,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -328,7 +328,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -367,19 +367,19 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index eb190a317b..799b903db7 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -698,11 +698,7 @@ func fillBenchBasesG2(samplePoints []G2Affine) { func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - var mixer fr.Element - mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") - for i := 1; i <= len(sampleScalars); i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() + for i := 0; i < len(sampleScalars); i++ { + sampleScalars[i].SetRandom() } } diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index 5fea03fa0e..42d6413264 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -16,7 +16,7 @@ package bn254 -const MAX_BATCH_SIZE = 600 +const MAX_BATCH_SIZE = 2000 type batchOp struct { bucketID, pointID uint32 @@ -45,7 +45,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -120,7 +120,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. 
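// processQueue (below) drains the overflow queue with an order-destroying
// swap-remove: a schedulable op is overwritten by the last element, the slice
// is shortened, and the same index is revisited (hence the i-- after the
// swap); ops whose bucket is still busy in the current batch stay put for a
// later pass. The idiom, on a generic slice s:
//
//	// delete element i in O(1), without preserving order
//	s[i] = s[len(s)-1]
//	s = s[:len(s)-1]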
processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -159,19 +159,19 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() @@ -253,7 +253,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -328,7 +328,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -367,19 +367,19 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index 05d133cdbe..7379ddccbd 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -698,11 +698,7 @@ func fillBenchBasesG2(samplePoints []G2Affine) { func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - var mixer fr.Element - mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") - for i := 1; i <= len(sampleScalars); i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). 
- FromMont() + for i := 0; i < len(sampleScalars); i++ { + sampleScalars[i].SetRandom() } } diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index 8305011734..49484c36c9 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -16,7 +16,7 @@ package bw6633 -const MAX_BATCH_SIZE = 600 +const MAX_BATCH_SIZE = 2000 type batchOp struct { bucketID, pointID uint32 @@ -45,7 +45,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -120,7 +120,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -159,19 +159,19 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() @@ -231,7 +231,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -306,7 +306,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -345,19 +345,19 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. 
queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 75e1904bf0..b367a2fe2f 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -698,11 +698,7 @@ func fillBenchBasesG2(samplePoints []G2Affine) { func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - var mixer fr.Element - mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") - for i := 1; i <= len(sampleScalars); i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() + for i := 0; i < len(sampleScalars); i++ { + sampleScalars[i].SetRandom() } } diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index 8b01509015..e1daea4ffe 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -16,7 +16,7 @@ package bw6756 -const MAX_BATCH_SIZE = 600 +const MAX_BATCH_SIZE = 2000 type batchOp struct { bucketID, pointID uint32 @@ -45,7 +45,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -120,7 +120,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -159,19 +159,19 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() @@ -231,7 +231,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -306,7 +306,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. 
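// Why an overflow queue is enough: assuming bucket indices were uniform
// (only roughly true for real scalar distributions), the chance that a fresh
// op collides with the current batch is about batchSize/len(buckets), i.e.
// roughly 1/30 ≈ 3% with the sizing above, so the queue should stay short.
// A rough sanity check:
//
//	pCollision := float64(batchSize) / float64(len(buckets)) // ≈ 0.033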
processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -345,19 +345,19 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index c37051d70d..f82d71c32f 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -698,11 +698,7 @@ func fillBenchBasesG2(samplePoints []G2Affine) { func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - var mixer fr.Element - mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") - for i := 1; i <= len(sampleScalars); i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() + for i := 0; i < len(sampleScalars); i++ { + sampleScalars[i].SetRandom() } } diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index 6cdd72b7a4..9edda2244b 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -16,7 +16,7 @@ package bw6761 -const MAX_BATCH_SIZE = 600 +const MAX_BATCH_SIZE = 2000 type batchOp struct { bucketID, pointID uint32 @@ -45,7 +45,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -120,7 +120,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -159,19 +159,19 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. 
queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() @@ -231,7 +231,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -306,7 +306,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -345,19 +345,19 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index b0a94d79dd..884a8564f8 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -698,11 +698,7 @@ func fillBenchBasesG2(samplePoints []G2Affine) { func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - var mixer fr.Element - mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") - for i := 1; i <= len(sampleScalars); i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). 
- FromMont() + for i := 0; i < len(sampleScalars); i++ { + sampleScalars[i].SetRandom() } } diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index 8fe8391308..a075f3b083 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -7,9 +7,7 @@ {{ $G2TJacobianExtended := print (toLower .G2.PointName) "JacExtended" }} - - -const MAX_BATCH_SIZE = 600 +const MAX_BATCH_SIZE = 2000 type batchOp struct { bucketID, pointID uint32 @@ -48,7 +46,7 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64 } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -124,7 +122,7 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64 } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. processQueue := func () { @@ -164,19 +162,19 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64 if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index 49f36bb6b3..c673254b2f 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -384,11 +384,7 @@ func fillBenchBases{{ toUpper $.PointName }}(samplePoints []{{ $.TAffine }}) { func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - var mixer fr.Element - mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") - for i := 1; i <= len(sampleScalars); i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). 
- FromMont() + for i := 0; i < len(sampleScalars); i++ { + sampleScalars[i].SetRandom() } } From e43bb7674c04056ce08f91ef593b345d01ae86e4 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 9 Nov 2022 11:55:36 -0600 Subject: [PATCH 09/43] perf: msm affine OK on x86 --- ecc/bls12-377/g1.go | 38 +++--- ecc/bls12-377/g2.go | 38 +++--- ecc/bls12-377/multiexp.go | 20 +-- ecc/bls12-377/multiexp_affine.go | 116 ++++++++++-------- ecc/bls12-377/multiexp_jacobian.go | 12 +- ecc/bls12-377/multiexp_test.go | 28 ++--- ecc/bls12-378/g1.go | 38 +++--- ecc/bls12-378/g2.go | 38 +++--- ecc/bls12-378/multiexp.go | 20 +-- ecc/bls12-378/multiexp_affine.go | 116 ++++++++++-------- ecc/bls12-378/multiexp_jacobian.go | 12 +- ecc/bls12-378/multiexp_test.go | 28 ++--- ecc/bls12-381/g1.go | 38 +++--- ecc/bls12-381/g2.go | 38 +++--- ecc/bls12-381/multiexp.go | 20 +-- ecc/bls12-381/multiexp_affine.go | 116 ++++++++++-------- ecc/bls12-381/multiexp_jacobian.go | 12 +- ecc/bls12-381/multiexp_test.go | 28 ++--- ecc/bls24-315/g1.go | 38 +++--- ecc/bls24-315/g2.go | 38 +++--- ecc/bls24-315/multiexp.go | 20 +-- ecc/bls24-315/multiexp_affine.go | 116 ++++++++++-------- ecc/bls24-315/multiexp_jacobian.go | 12 +- ecc/bls24-315/multiexp_test.go | 28 ++--- ecc/bls24-317/g1.go | 38 +++--- ecc/bls24-317/g2.go | 38 +++--- ecc/bls24-317/multiexp.go | 20 +-- ecc/bls24-317/multiexp_affine.go | 116 ++++++++++-------- ecc/bls24-317/multiexp_jacobian.go | 12 +- ecc/bls24-317/multiexp_test.go | 28 ++--- ecc/bn254/g1.go | 38 +++--- ecc/bn254/g2.go | 38 +++--- ecc/bn254/multiexp.go | 20 +-- ecc/bn254/multiexp_affine.go | 116 ++++++++++-------- ecc/bn254/multiexp_jacobian.go | 12 +- ecc/bn254/multiexp_test.go | 28 ++--- ecc/bw6-633/g1.go | 38 +++--- ecc/bw6-633/g2.go | 38 +++--- ecc/bw6-633/multiexp_affine.go | 104 ++++++++++------ ecc/bw6-633/multiexp_test.go | 26 +--- ecc/bw6-756/g1.go | 38 +++--- ecc/bw6-756/g2.go | 38 +++--- ecc/bw6-756/multiexp_affine.go | 104 ++++++++++------ ecc/bw6-756/multiexp_test.go | 26 +--- ecc/bw6-761/g1.go | 38 +++--- ecc/bw6-761/g2.go | 38 +++--- ecc/bw6-761/multiexp_affine.go | 104 ++++++++++------ ecc/bw6-761/multiexp_test.go | 26 +--- internal/generator/config/curve.go | 2 +- .../ecc/template/multiexp_affine.go.tmpl | 52 +++++--- internal/generator/ecc/template/point.go.tmpl | 36 +++--- .../ecc/template/tests/multiexp.go.tmpl | 16 +-- 52 files changed, 1042 insertions(+), 1194 deletions(-) diff --git a/ecc/bls12-377/g1.go b/ecc/bls12-377/g1.go index 3b436a6b2b..962a527a2e 100644 --- a/ecc/bls12-377/g1.go +++ b/ecc/bls12-377/g1.go @@ -983,31 +983,27 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { +func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fp.Element - - { - var lambdain [MAX_BATCH_SIZE]fp.Element - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG1Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fp.Element + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + 
lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fp.Element var rr G1Affine @@ -1036,19 +1032,19 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { // batch inversion // similar to BatchInvertfp.Element, ignores edge cases -func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { +func batchInvertG1Affine(res, a []fp.Element) { var accumulator fp.Element accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bls12-377/g2.go b/ecc/bls12-377/g2.go index 18810fe510..dd09808a13 100644 --- a/ecc/bls12-377/g2.go +++ b/ecc/bls12-377/g2.go @@ -979,31 +979,27 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { +func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fptower.E2 - - { - var lambdain [MAX_BATCH_SIZE]fptower.E2 - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG2Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fptower.E2 + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fptower.E2 var rr G2Affine @@ -1032,19 +1028,19 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { // batch inversion // similar to BatchInvertfptower.E2, ignores edge cases -func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fptower.E2, n int) { +func batchInvertG2Affine(res, a []fptower.E2) { var accumulator fptower.E2 accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index 89db336b35..98de6ca242 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -84,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -198,14 +198,6 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, 
splitFirstC case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) - case 20: - processChunk := processChunkG1BatchAffine[bucketG1AffineC20] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) - case 21: - processChunk := processChunkG1BatchAffine[bucketG1AffineC21] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -338,7 +330,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -452,14 +444,6 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstC case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) - case 20: - processChunk := processChunkG2BatchAffine[bucketG2AffineC20] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) - case 21: - processChunk := processChunkG2BatchAffine[bucketG2AffineC21] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index 015864143d..66c9d4e8bf 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -16,7 +16,7 @@ package bls12377 -const MAX_BATCH_SIZE = 2000 +const MAX_BATCH_SIZE = 600 type batchOp struct { bucketID, pointID uint32 @@ -45,15 +45,15 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G1Affine // ... @@ -71,7 +71,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if cptP == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + BatchAddG1Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -120,24 +120,36 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. 
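// This commit swaps the heap-allocated queue slice for the fixed-size array
// below: MAX_BATCH_SIZE drops back to 600, presumably so that the batchOp
// array (like the P and R arrays above it) stays cheap to keep in the
// goroutine's frame, and qID acts as a stack pointer. Push and pop reduce to
// index arithmetic with no allocator traffic:
//
//	queue[qID] = op // push; the code flushes before qID can overflow
//	qID++
//	op := queue[qID-1] // peek the top, then pop with qID--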
+ var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -158,22 +170,23 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. } @@ -213,8 +226,6 @@ type bucketG1AffineC13 [1 << (13 - 1)]G1Affine type bucketG1AffineC14 [1 << (14 - 1)]G1Affine type bucketG1AffineC15 [1 << (15 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine -type bucketG1AffineC20 [1 << (20 - 1)]G1Affine -type bucketG1AffineC21 [1 << (21 - 1)]G1Affine type ibG1Affine interface { bucketG1AffineC4 | @@ -229,9 +240,7 @@ type ibG1Affine interface { bucketG1AffineC13 | bucketG1AffineC14 | bucketG1AffineC15 | - bucketG1AffineC16 | - bucketG1AffineC20 | - bucketG1AffineC21 + bucketG1AffineC16 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -253,15 +262,15 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G2Affine // ... @@ -279,7 +288,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if cptP == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + BatchAddG2Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -328,24 +337,36 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. 
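+	// Two draining strategies coexist. processQueue walks the whole queue
+	// backwards, retries every op whose bucket has been freed, and
+	// compacts by swapping the last element into the hole. processTopQueue
+	// only pops from the top while the most recently queued op is addable
+	// and stops at the first conflict: a cheaper pass, used right after a
+	// batch flush when most buckets have just been released.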
+ var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -366,22 +387,23 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. } @@ -421,8 +443,6 @@ type bucketG2AffineC13 [1 << (13 - 1)]G2Affine type bucketG2AffineC14 [1 << (14 - 1)]G2Affine type bucketG2AffineC15 [1 << (15 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine -type bucketG2AffineC20 [1 << (20 - 1)]G2Affine -type bucketG2AffineC21 [1 << (21 - 1)]G2Affine type ibG2Affine interface { bucketG2AffineC4 | @@ -437,7 +457,5 @@ type ibG2Affine interface { bucketG2AffineC13 | bucketG2AffineC14 | bucketG2AffineC15 | - bucketG2AffineC16 | - bucketG2AffineC20 | - bucketG2AffineC21 + bucketG2AffineC16 } diff --git a/ecc/bls12-377/multiexp_jacobian.go b/ecc/bls12-377/multiexp_jacobian.go index 2c95e7f536..be722067bd 100644 --- a/ecc/bls12-377/multiexp_jacobian.go +++ b/ecc/bls12-377/multiexp_jacobian.go @@ -74,8 +74,6 @@ type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended -type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended @@ -94,9 +92,7 @@ type ibg1JacExtended interface { bucketg1JacExtendedC13 | bucketg1JacExtendedC14 | bucketg1JacExtendedC15 | - bucketg1JacExtendedC16 | - bucketg1JacExtendedC20 | - bucketg1JacExtendedC21 + bucketg1JacExtendedC16 } func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, @@ -157,8 +153,6 @@ type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended -type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended @@ -177,7 +171,5 @@ type ibg2JacExtended interface { bucketg2JacExtendedC13 | 
bucketg2JacExtendedC14 | bucketg2JacExtendedC15 | - bucketg2JacExtendedC16 | - bucketg2JacExtendedC20 | - bucketg2JacExtendedC21 + bucketg2JacExtendedC16 } diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 72f274c242..ca2e50f59c 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -275,17 +275,10 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -295,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G1Affine @@ -606,17 +599,10 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -626,7 +612,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G2Affine diff --git a/ecc/bls12-378/g1.go b/ecc/bls12-378/g1.go index 5b9ec0f84f..8422e95efb 100644 --- a/ecc/bls12-378/g1.go +++ b/ecc/bls12-378/g1.go @@ -983,31 +983,27 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { +func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fp.Element - - { - var lambdain [MAX_BATCH_SIZE]fp.Element - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG1Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fp.Element + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + 
isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fp.Element var rr G1Affine @@ -1036,19 +1032,19 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { // batch inversion // similar to BatchInvertfp.Element, ignores edge cases -func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { +func batchInvertG1Affine(res, a []fp.Element) { var accumulator fp.Element accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bls12-378/g2.go b/ecc/bls12-378/g2.go index 0010b3983b..9cca73e6b3 100644 --- a/ecc/bls12-378/g2.go +++ b/ecc/bls12-378/g2.go @@ -979,31 +979,27 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { +func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fptower.E2 - - { - var lambdain [MAX_BATCH_SIZE]fptower.E2 - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG2Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fptower.E2 + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fptower.E2 var rr G2Affine @@ -1032,19 +1028,19 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { // batch inversion // similar to BatchInvertfptower.E2, ignores edge cases -func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fptower.E2, n int) { +func batchInvertG2Affine(res, a []fptower.E2) { var accumulator fptower.E2 accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 5914a5a0d6..917f493796 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -84,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -198,14 +198,6 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits 
[]uint32, splitFirstC case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) - case 20: - processChunk := processChunkG1BatchAffine[bucketG1AffineC20] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) - case 21: - processChunk := processChunkG1BatchAffine[bucketG1AffineC21] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -338,7 +330,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -452,14 +444,6 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstC case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) - case 20: - processChunk := processChunkG2BatchAffine[bucketG2AffineC20] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) - case 21: - processChunk := processChunkG2BatchAffine[bucketG2AffineC21] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index fa5dc6c792..a48d9d1cfd 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -16,7 +16,7 @@ package bls12378 -const MAX_BATCH_SIZE = 2000 +const MAX_BATCH_SIZE = 600 type batchOp struct { bucketID, pointID uint32 @@ -45,15 +45,15 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G1Affine // ... @@ -71,7 +71,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if cptP == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + BatchAddG1Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -120,24 +120,36 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. 
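+	// Sizing example: with a window of c = 16 there are 1<<15 = 32768
+	// buckets, so len(buckets)/20 = 1638 gets clamped to MAX_BATCH_SIZE
+	// (600); with c = 5 and only 16 buckets, 16/20 = 0 is bumped back up
+	// to a batch of 1.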
+ var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -158,22 +170,23 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. } @@ -213,8 +226,6 @@ type bucketG1AffineC13 [1 << (13 - 1)]G1Affine type bucketG1AffineC14 [1 << (14 - 1)]G1Affine type bucketG1AffineC15 [1 << (15 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine -type bucketG1AffineC20 [1 << (20 - 1)]G1Affine -type bucketG1AffineC21 [1 << (21 - 1)]G1Affine type ibG1Affine interface { bucketG1AffineC4 | @@ -229,9 +240,7 @@ type ibG1Affine interface { bucketG1AffineC13 | bucketG1AffineC14 | bucketG1AffineC15 | - bucketG1AffineC16 | - bucketG1AffineC20 | - bucketG1AffineC21 + bucketG1AffineC16 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -253,15 +262,15 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G2Affine // ... @@ -279,7 +288,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if cptP == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + BatchAddG2Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -328,24 +337,36 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. 
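+	// Cost sketch for the BatchAddG2Affine call in executeAndReset, per
+	// its header comment: roughly 5 extension-field mults per point (about
+	// 3 amortized from the shared batch inversion, plus the slope and
+	// coordinate products) and one inversion for the whole batch; a
+	// doubling spends one extra mult on the x^2 term of the tangent slope.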
+ var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -366,22 +387,23 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. } @@ -421,8 +443,6 @@ type bucketG2AffineC13 [1 << (13 - 1)]G2Affine type bucketG2AffineC14 [1 << (14 - 1)]G2Affine type bucketG2AffineC15 [1 << (15 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine -type bucketG2AffineC20 [1 << (20 - 1)]G2Affine -type bucketG2AffineC21 [1 << (21 - 1)]G2Affine type ibG2Affine interface { bucketG2AffineC4 | @@ -437,7 +457,5 @@ type ibG2Affine interface { bucketG2AffineC13 | bucketG2AffineC14 | bucketG2AffineC15 | - bucketG2AffineC16 | - bucketG2AffineC20 | - bucketG2AffineC21 + bucketG2AffineC16 } diff --git a/ecc/bls12-378/multiexp_jacobian.go b/ecc/bls12-378/multiexp_jacobian.go index 3ce29436eb..6a8cfa2d32 100644 --- a/ecc/bls12-378/multiexp_jacobian.go +++ b/ecc/bls12-378/multiexp_jacobian.go @@ -74,8 +74,6 @@ type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended -type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended @@ -94,9 +92,7 @@ type ibg1JacExtended interface { bucketg1JacExtendedC13 | bucketg1JacExtendedC14 | bucketg1JacExtendedC15 | - bucketg1JacExtendedC16 | - bucketg1JacExtendedC20 | - bucketg1JacExtendedC21 + bucketg1JacExtendedC16 } func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, @@ -157,8 +153,6 @@ type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended -type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended @@ -177,7 +171,5 @@ type ibg2JacExtended interface { bucketg2JacExtendedC13 | 
bucketg2JacExtendedC14 | bucketg2JacExtendedC15 | - bucketg2JacExtendedC16 | - bucketg2JacExtendedC20 | - bucketg2JacExtendedC21 + bucketg2JacExtendedC16 } diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index 85f7b72e69..339323bbac 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -275,17 +275,10 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -295,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G1Affine @@ -606,17 +599,10 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -626,7 +612,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G2Affine diff --git a/ecc/bls12-381/g1.go b/ecc/bls12-381/g1.go index eccf0c9c97..bb37dacb65 100644 --- a/ecc/bls12-381/g1.go +++ b/ecc/bls12-381/g1.go @@ -983,31 +983,27 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { +func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fp.Element - - { - var lambdain [MAX_BATCH_SIZE]fp.Element - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG1Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fp.Element + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + 
isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fp.Element var rr G1Affine @@ -1036,19 +1032,19 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { // batch inversion // similar to BatchInvertfp.Element, ignores edge cases -func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { +func batchInvertG1Affine(res, a []fp.Element) { var accumulator fp.Element accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bls12-381/g2.go b/ecc/bls12-381/g2.go index 5264766d99..86ce9db5b6 100644 --- a/ecc/bls12-381/g2.go +++ b/ecc/bls12-381/g2.go @@ -980,31 +980,27 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { +func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fptower.E2 - - { - var lambdain [MAX_BATCH_SIZE]fptower.E2 - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG2Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fptower.E2 + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fptower.E2 var rr G2Affine @@ -1033,19 +1029,19 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { // batch inversion // similar to BatchInvertfptower.E2, ignores edge cases -func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fptower.E2, n int) { +func batchInvertG2Affine(res, a []fptower.E2) { var accumulator fptower.E2 accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index 7f730ca946..8283ce4957 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -84,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -198,14 +198,6 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits 
[]uint32, splitFirstC case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) - case 20: - processChunk := processChunkG1BatchAffine[bucketG1AffineC20] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) - case 21: - processChunk := processChunkG1BatchAffine[bucketG1AffineC21] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -338,7 +330,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -452,14 +444,6 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstC case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) - case 20: - processChunk := processChunkG2BatchAffine[bucketG2AffineC20] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) - case 21: - processChunk := processChunkG2BatchAffine[bucketG2AffineC21] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index f8ae40fb18..d30c10293e 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -16,7 +16,7 @@ package bls12381 -const MAX_BATCH_SIZE = 2000 +const MAX_BATCH_SIZE = 600 type batchOp struct { bucketID, pointID uint32 @@ -45,15 +45,15 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G1Affine // ... @@ -71,7 +71,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if cptP == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + BatchAddG1Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -120,24 +120,36 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. 
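+	// The batchInvertG1Affine behind BatchAddG1Affine is the Montgomery
+	// trick. For inputs a0, a1, a2 it stores the running prefix products
+	// 1, a0, a0*a1, inverts t = (a0*a1*a2)^-1 once, then sweeps backwards:
+	//   a2^-1 = (a0*a1) * t,  then t *= a2
+	//   a1^-1 = a0 * t,       then t *= a1
+	//   a0^-1 = t
+	// One inversion amortized across the batch, ~3 extra mults per element.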
+ var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -158,22 +170,23 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. } @@ -213,8 +226,6 @@ type bucketG1AffineC13 [1 << (13 - 1)]G1Affine type bucketG1AffineC14 [1 << (14 - 1)]G1Affine type bucketG1AffineC15 [1 << (15 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine -type bucketG1AffineC20 [1 << (20 - 1)]G1Affine -type bucketG1AffineC21 [1 << (21 - 1)]G1Affine type ibG1Affine interface { bucketG1AffineC4 | @@ -229,9 +240,7 @@ type ibG1Affine interface { bucketG1AffineC13 | bucketG1AffineC14 | bucketG1AffineC15 | - bucketG1AffineC16 | - bucketG1AffineC20 | - bucketG1AffineC21 + bucketG1AffineC16 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -253,15 +262,15 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G2Affine // ... @@ -279,7 +288,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if cptP == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + BatchAddG2Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -328,24 +337,36 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. 
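+	// Caveat carried by the "ignores edge cases" comment on
+	// batchInvertG2Affine: a single zero denominator (adding a point to
+	// its own negative, or doubling a point with Y = 0) would zero the
+	// running product and corrupt every inverse in the batch, so the
+	// scheduler must only ever submit ops with nonzero denominators.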
+ var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -366,22 +387,23 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. } @@ -421,8 +443,6 @@ type bucketG2AffineC13 [1 << (13 - 1)]G2Affine type bucketG2AffineC14 [1 << (14 - 1)]G2Affine type bucketG2AffineC15 [1 << (15 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine -type bucketG2AffineC20 [1 << (20 - 1)]G2Affine -type bucketG2AffineC21 [1 << (21 - 1)]G2Affine type ibG2Affine interface { bucketG2AffineC4 | @@ -437,7 +457,5 @@ type ibG2Affine interface { bucketG2AffineC13 | bucketG2AffineC14 | bucketG2AffineC15 | - bucketG2AffineC16 | - bucketG2AffineC20 | - bucketG2AffineC21 + bucketG2AffineC16 } diff --git a/ecc/bls12-381/multiexp_jacobian.go b/ecc/bls12-381/multiexp_jacobian.go index 7c69354658..fabbf2d237 100644 --- a/ecc/bls12-381/multiexp_jacobian.go +++ b/ecc/bls12-381/multiexp_jacobian.go @@ -74,8 +74,6 @@ type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended -type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended @@ -94,9 +92,7 @@ type ibg1JacExtended interface { bucketg1JacExtendedC13 | bucketg1JacExtendedC14 | bucketg1JacExtendedC15 | - bucketg1JacExtendedC16 | - bucketg1JacExtendedC20 | - bucketg1JacExtendedC21 + bucketg1JacExtendedC16 } func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, @@ -157,8 +153,6 @@ type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended -type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended @@ -177,7 +171,5 @@ type ibg2JacExtended interface { bucketg2JacExtendedC13 | 
bucketg2JacExtendedC14 | bucketg2JacExtendedC15 | - bucketg2JacExtendedC16 | - bucketg2JacExtendedC20 | - bucketg2JacExtendedC21 + bucketg2JacExtendedC16 } diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index dbcdc6eb85..ce2153872e 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -275,17 +275,10 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -295,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G1Affine @@ -606,17 +599,10 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -626,7 +612,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G2Affine diff --git a/ecc/bls24-315/g1.go b/ecc/bls24-315/g1.go index 173d24e902..e55d4ad4cb 100644 --- a/ecc/bls24-315/g1.go +++ b/ecc/bls24-315/g1.go @@ -985,31 +985,27 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { +func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fp.Element - - { - var lambdain [MAX_BATCH_SIZE]fp.Element - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG1Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fp.Element + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + 
isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fp.Element var rr G1Affine @@ -1038,19 +1034,19 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { // batch inversion // similar to BatchInvertfp.Element, ignores edge cases -func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { +func batchInvertG1Affine(res, a []fp.Element) { var accumulator fp.Element accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bls24-315/g2.go b/ecc/bls24-315/g2.go index e498978c0b..f5dffd0752 100644 --- a/ecc/bls24-315/g2.go +++ b/ecc/bls24-315/g2.go @@ -995,31 +995,27 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { +func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fptower.E4 - - { - var lambdain [MAX_BATCH_SIZE]fptower.E4 - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG2Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fptower.E4 + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fptower.E4 var rr G2Affine @@ -1048,19 +1044,19 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { // batch inversion // similar to BatchInvertfptower.E4, ignores edge cases -func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fptower.E4, n int) { +func batchInvertG2Affine(res, a []fptower.E4) { var accumulator fptower.E4 accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 37b43c6fe8..922c80cd89 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -84,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -198,14 +198,6 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits 
[]uint32, splitFirstC case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) - case 20: - processChunk := processChunkG1BatchAffine[bucketG1AffineC20] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) - case 21: - processChunk := processChunkG1BatchAffine[bucketG1AffineC21] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -338,7 +330,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -452,14 +444,6 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstC case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) - case 20: - processChunk := processChunkG2BatchAffine[bucketG2AffineC20] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) - case 21: - processChunk := processChunkG2BatchAffine[bucketG2AffineC21] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index 485f6960ff..c7aa56e2d5 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -16,7 +16,7 @@ package bls24315 -const MAX_BATCH_SIZE = 2000 +const MAX_BATCH_SIZE = 600 type batchOp struct { bucketID, pointID uint32 @@ -45,15 +45,15 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G1Affine // ... @@ -71,7 +71,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if cptP == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + BatchAddG1Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -120,24 +120,36 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. 
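+	// Why at most one op per bucket per batch (the bucketIds set that
+	// canAdd consults): BatchAddG1Affine reads every R[j] up front when it
+	// forms the batched slope denominators, so a second addition into the
+	// same bucket within one batch would see a stale bucket value.
+	// Conflicting ops wait in the queue until executeAndReset clears
+	// bucketIds.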
+ var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -158,22 +170,23 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. } @@ -213,8 +226,6 @@ type bucketG1AffineC13 [1 << (13 - 1)]G1Affine type bucketG1AffineC14 [1 << (14 - 1)]G1Affine type bucketG1AffineC15 [1 << (15 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine -type bucketG1AffineC20 [1 << (20 - 1)]G1Affine -type bucketG1AffineC21 [1 << (21 - 1)]G1Affine type ibG1Affine interface { bucketG1AffineC4 | @@ -229,9 +240,7 @@ type ibG1Affine interface { bucketG1AffineC13 | bucketG1AffineC14 | bucketG1AffineC15 | - bucketG1AffineC16 | - bucketG1AffineC20 | - bucketG1AffineC21 + bucketG1AffineC16 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -253,15 +262,15 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G2Affine // ... @@ -279,7 +288,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if cptP == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + BatchAddG2Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -328,24 +337,36 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. 
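+	// If the queue itself fills (qID reaching MAX_BATCH_SIZE-1 in the loop
+	// below), the in-flight batch is flushed with executeAndReset, which
+	// releases every contended bucket, and processQueue then drains as
+	// much of the backlog as possible before normal scheduling resumes.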
+ var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -366,22 +387,23 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. } @@ -421,8 +443,6 @@ type bucketG2AffineC13 [1 << (13 - 1)]G2Affine type bucketG2AffineC14 [1 << (14 - 1)]G2Affine type bucketG2AffineC15 [1 << (15 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine -type bucketG2AffineC20 [1 << (20 - 1)]G2Affine -type bucketG2AffineC21 [1 << (21 - 1)]G2Affine type ibG2Affine interface { bucketG2AffineC4 | @@ -437,7 +457,5 @@ type ibG2Affine interface { bucketG2AffineC13 | bucketG2AffineC14 | bucketG2AffineC15 | - bucketG2AffineC16 | - bucketG2AffineC20 | - bucketG2AffineC21 + bucketG2AffineC16 } diff --git a/ecc/bls24-315/multiexp_jacobian.go b/ecc/bls24-315/multiexp_jacobian.go index 6663cc9e73..a3d633de01 100644 --- a/ecc/bls24-315/multiexp_jacobian.go +++ b/ecc/bls24-315/multiexp_jacobian.go @@ -74,8 +74,6 @@ type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended -type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended @@ -94,9 +92,7 @@ type ibg1JacExtended interface { bucketg1JacExtendedC13 | bucketg1JacExtendedC14 | bucketg1JacExtendedC15 | - bucketg1JacExtendedC16 | - bucketg1JacExtendedC20 | - bucketg1JacExtendedC21 + bucketg1JacExtendedC16 } func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, @@ -157,8 +153,6 @@ type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended -type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended @@ -177,7 +171,5 @@ type ibg2JacExtended interface { bucketg2JacExtendedC13 | 
bucketg2JacExtendedC14 | bucketg2JacExtendedC15 | - bucketg2JacExtendedC16 | - bucketg2JacExtendedC20 | - bucketg2JacExtendedC21 + bucketg2JacExtendedC16 } diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index b2978d6aa0..f8513bd3a1 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -275,17 +275,10 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -295,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G1Affine @@ -606,17 +599,10 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -626,7 +612,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G2Affine diff --git a/ecc/bls24-317/g1.go b/ecc/bls24-317/g1.go index 9443125d34..58bee14819 100644 --- a/ecc/bls24-317/g1.go +++ b/ecc/bls24-317/g1.go @@ -985,31 +985,27 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { +func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fp.Element - - { - var lambdain [MAX_BATCH_SIZE]fp.Element - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG1Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fp.Element + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + 
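The restructured BatchAdd* bodies implement the standard affine chord-and-tangent formulas for the a = 0 short Weierstrass curves used here; only the slope denominator differs between the two branches, and that is exactly what gets batch-inverted:

    add (P != R):  lambda = (y_P - y_R) / (x_P - x_R)
    dbl (P == R):  lambda = 3*x_P^2 / (2*y_P)

    x' = lambda^2 - x_R - x_P
    y' = lambda*(x_R - x') - y_R

So the first loop stores only the denominator (x_P - x_R, or 2*y_P when isDbl) in lambdain, one shared inversion produces all the reciprocals at once, and the second loop multiplies each reciprocal by its numerator before applying x'/y'. That is where the stated cost of 5 M per addition plus one amortized I comes from (a doubling pays one extra M for the x^2).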
isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fp.Element var rr G1Affine @@ -1038,19 +1034,19 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { // batch inversion // similar to BatchInvertfp.Element, ignores edge cases -func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { +func batchInvertG1Affine(res, a []fp.Element) { var accumulator fp.Element accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bls24-317/g2.go b/ecc/bls24-317/g2.go index 0e2738e211..f5fb993fb4 100644 --- a/ecc/bls24-317/g2.go +++ b/ecc/bls24-317/g2.go @@ -995,31 +995,27 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { +func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fptower.E4 - - { - var lambdain [MAX_BATCH_SIZE]fptower.E4 - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG2Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fptower.E4 + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fptower.E4 var rr G2Affine @@ -1048,19 +1044,19 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { // batch inversion // similar to BatchInvertfptower.E4, ignores edge cases -func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fptower.E4, n int) { +func batchInvertG2Affine(res, a []fptower.E4) { var accumulator fptower.E4 accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index 5cb36f1788..923946e34f 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -84,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -198,14 +198,6 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits 
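batchInvertG1Affine / batchInvertG2Affine are Montgomery's batch-inversion trick, now slice-based so the caller just passes lambda[:batchSize]: a forward pass accumulates prefix products, a single field inversion is paid for the whole batch, and a reverse pass peels off the individual inverses. A self-contained sketch over a toy prime field with math/big, not part of the patch (the real code works on fp.Element / tower elements and, per its comment, ignores the zero-input edge case):

package main

import (
	"fmt"
	"math/big"
)

// batchInvert sets res[i] = a[i]^-1 mod p using one modular inversion
// (Montgomery's trick). Assumes no a[i] is zero.
func batchInvert(res, a []*big.Int, p *big.Int) {
	acc := big.NewInt(1)
	for i := range a {
		res[i] = new(big.Int).Set(acc) // res[i] = a[0]*...*a[i-1]
		acc.Mul(acc, a[i])
		acc.Mod(acc, p)
	}
	acc.ModInverse(acc, p) // the single inversion
	for i := len(a) - 1; i >= 0; i-- {
		res[i].Mul(res[i], acc) // prefix * (a[0..i])^-1 = a[i]^-1
		res[i].Mod(res[i], p)
		acc.Mul(acc, a[i]) // drop a[i] from the running inverse
		acc.Mod(acc, p)
	}
}

func main() {
	p := big.NewInt(101) // toy prime
	a := []*big.Int{big.NewInt(3), big.NewInt(7), big.NewInt(10)}
	res := make([]*big.Int, len(a))
	batchInvert(res, a, p)
	for i := range a {
		check := new(big.Int).Mul(a[i], res[i])
		fmt.Println(a[i], "* inv =", check.Mod(check, p)) // always 1
	}
}
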
[]uint32, splitFirstC case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) - case 20: - processChunk := processChunkG1BatchAffine[bucketG1AffineC20] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) - case 21: - processChunk := processChunkG1BatchAffine[bucketG1AffineC21] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -338,7 +330,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -452,14 +444,6 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstC case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) - case 20: - processChunk := processChunkG2BatchAffine[bucketG2AffineC20] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) - case 21: - processChunk := processChunkG2BatchAffine[bucketG2AffineC21] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index 432592fa23..ccd70a9474 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -16,7 +16,7 @@ package bls24317 -const MAX_BATCH_SIZE = 2000 +const MAX_BATCH_SIZE = 600 type batchOp struct { bucketID, pointID uint32 @@ -45,15 +45,15 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G1Affine // ... @@ -71,7 +71,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if cptP == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + BatchAddG1Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -120,24 +120,36 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. 
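The batch size heuristic also changes: one twentieth of the bucket count instead of one thirtieth, clamped to [1, MAX_BATCH_SIZE]. A larger batch amortizes the single inversion over more additions at the price of more bucket-conflict queuing. What the clamp yields per window size, as a quick standalone check (not part of the patch):

package main

import "fmt"

const maxBatchSize = 600

func batchSizeFor(c int) int {
	nbBuckets := 1 << (c - 1)
	b := nbBuckets / 20
	if b > maxBatchSize {
		b = maxBatchSize
	}
	if b <= 0 {
		b = 1
	}
	return b
}

func main() {
	for _, c := range []int{4, 10, 16} {
		fmt.Printf("c=%2d buckets=%5d batchSize=%d\n", c, 1<<(c-1), batchSizeFor(c))
	}
	// c= 4 buckets=    8 batchSize=1
	// c=10 buckets=  512 batchSize=25
	// c=16 buckets=32768 batchSize=600  (clamped)
}
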
+ var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -158,22 +170,23 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. } @@ -213,8 +226,6 @@ type bucketG1AffineC13 [1 << (13 - 1)]G1Affine type bucketG1AffineC14 [1 << (14 - 1)]G1Affine type bucketG1AffineC15 [1 << (15 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine -type bucketG1AffineC20 [1 << (20 - 1)]G1Affine -type bucketG1AffineC21 [1 << (21 - 1)]G1Affine type ibG1Affine interface { bucketG1AffineC4 | @@ -229,9 +240,7 @@ type ibG1Affine interface { bucketG1AffineC13 | bucketG1AffineC14 | bucketG1AffineC15 | - bucketG1AffineC16 | - bucketG1AffineC20 | - bucketG1AffineC21 + bucketG1AffineC16 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -253,15 +262,15 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G2Affine // ... @@ -279,7 +288,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if cptP == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + BatchAddG2Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -328,24 +337,36 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. 
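processTopQueue, introduced next to processQueue, is the cheap variant used right after a flush: executeAndReset has just cleared every pending bucket ID, so entries at the top of the queue are likely schedulable, and it pops from the top and stops at the first conflict instead of scanning the whole array. A standalone sketch of that drain, not part of the patch:

package main

import "fmt"

type batchOp struct{ bucketID, pointID uint32 }

func main() {
	queue := []batchOp{{0, 0}, {5, 1}, {2, 2}, {4, 3}} // bottom ... top
	qID := len(queue)
	canAdd := func(id uint32) bool { return id%2 == 0 } // stand-in

	// pop schedulable ops off the top; bail at the first conflict
	for i := qID - 1; i >= 0; i-- {
		if !canAdd(queue[i].bucketID) {
			break
		}
		fmt.Println("scheduled", queue[i])
		qID--
	}
	fmt.Println("left in queue:", qID) // stops at bucketID 5
}
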
+ var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -366,22 +387,23 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. } @@ -421,8 +443,6 @@ type bucketG2AffineC13 [1 << (13 - 1)]G2Affine type bucketG2AffineC14 [1 << (14 - 1)]G2Affine type bucketG2AffineC15 [1 << (15 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine -type bucketG2AffineC20 [1 << (20 - 1)]G2Affine -type bucketG2AffineC21 [1 << (21 - 1)]G2Affine type ibG2Affine interface { bucketG2AffineC4 | @@ -437,7 +457,5 @@ type ibG2Affine interface { bucketG2AffineC13 | bucketG2AffineC14 | bucketG2AffineC15 | - bucketG2AffineC16 | - bucketG2AffineC20 | - bucketG2AffineC21 + bucketG2AffineC16 } diff --git a/ecc/bls24-317/multiexp_jacobian.go b/ecc/bls24-317/multiexp_jacobian.go index fccf3e949d..7e832db4e7 100644 --- a/ecc/bls24-317/multiexp_jacobian.go +++ b/ecc/bls24-317/multiexp_jacobian.go @@ -74,8 +74,6 @@ type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended -type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended @@ -94,9 +92,7 @@ type ibg1JacExtended interface { bucketg1JacExtendedC13 | bucketg1JacExtendedC14 | bucketg1JacExtendedC15 | - bucketg1JacExtendedC16 | - bucketg1JacExtendedC20 | - bucketg1JacExtendedC21 + bucketg1JacExtendedC16 } func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, @@ -157,8 +153,6 @@ type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended -type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended @@ -177,7 +171,5 @@ type ibg2JacExtended interface { bucketg2JacExtendedC13 | 
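Dropping c = 20 and c = 21 is consistent with the batch-affine buckets being a plain value array per chunk: 2^(c-1) affine points at c = 21 is 2^20 points, versus 2^15 at the new maximum of c = 16. Back-of-the-envelope footprint, assuming bn254 G1Affine at 64 bytes (two 4x uint64 field elements; other curves are larger) — an assumption for illustration, the patch itself states no rationale:

package main

import "fmt"

func main() {
	const pointSize = 64 // assumed: bn254 G1Affine, 2 x 32-byte fp.Element
	for _, c := range []int{16, 20, 21} {
		nbBuckets := 1 << (c - 1)
		fmt.Printf("c=%2d: %7d buckets = %2d MiB of G1 buckets per chunk\n",
			c, nbBuckets, nbBuckets*pointSize>>20)
	}
	// c=16:   32768 buckets =  2 MiB
	// c=20:  524288 buckets = 32 MiB
	// c=21: 1048576 buckets = 64 MiB
}
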
bucketg2JacExtendedC14 | bucketg2JacExtendedC15 | - bucketg2JacExtendedC16 | - bucketg2JacExtendedC20 | - bucketg2JacExtendedC21 + bucketg2JacExtendedC16 } diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index 799b903db7..214d884e11 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -275,17 +275,10 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -295,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G1Affine @@ -606,17 +599,10 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -626,7 +612,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G2Affine diff --git a/ecc/bn254/g1.go b/ecc/bn254/g1.go index 0844716e0f..80cec53604 100644 --- a/ecc/bn254/g1.go +++ b/ecc/bn254/g1.go @@ -955,31 +955,27 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { +func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fp.Element - - { - var lambdain [MAX_BATCH_SIZE]fp.Element - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG1Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fp.Element + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true 
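With the explicit batchSize parameter gone, the batch length is simply len(R), and call sites slice: BatchAddG1Affine(R[:cptP], P[:cptP]). A hedged usage sketch against the bn254 package as it stands in this patch (BatchAddG1Affine exported, P == R lanes taking the doubling path); not part of the patch:

package main

import (
	"fmt"

	"github.com/consensys/gnark-crypto/ecc/bn254"
)

func main() {
	_, _, g, _ := bn254.Generators() // affine G1 generator

	const n = 4 // must stay <= MAX_BATCH_SIZE
	var acc [n]bn254.G1Affine // accumulators, updated in place via R
	var pts [n]bn254.G1Affine // points to add into them
	R := make([]*bn254.G1Affine, n)
	for i := 0; i < n; i++ {
		acc[i] = g
		pts[i] = g
		R[i] = &acc[i]
	}

	// one shared inversion for all n lanes; every pair here is
	// P == R, so each lane doubles.
	bn254.BatchAddG1Affine(R, pts[:])

	var dbl bn254.G1Jac
	dbl.FromAffine(&g).DoubleAssign()
	var want bn254.G1Affine
	want.FromJacobian(&dbl)
	fmt.Println("batch double matches:", acc[0].Equal(&want))
}
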
+ lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fp.Element var rr G1Affine @@ -1008,19 +1004,19 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { // batch inversion // similar to BatchInvertfp.Element, ignores edge cases -func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { +func batchInvertG1Affine(res, a []fp.Element) { var accumulator fp.Element accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bn254/g2.go b/ecc/bn254/g2.go index 23203fd92c..79215583d5 100644 --- a/ecc/bn254/g2.go +++ b/ecc/bn254/g2.go @@ -984,31 +984,27 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { +func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fptower.E2 - - { - var lambdain [MAX_BATCH_SIZE]fptower.E2 - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG2Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fptower.E2 + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fptower.E2 var rr G2Affine @@ -1037,19 +1033,19 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { // batch inversion // similar to BatchInvertfptower.E2, ignores edge cases -func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fptower.E2, n int) { +func batchInvertG2Affine(res, a []fptower.E2) { var accumulator fptower.E2 accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index e3c84b390f..410e4016ab 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -84,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -198,14 +198,6 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstC case 16: processChunk := 
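bestC scans the implemented window sizes and keeps the c minimizing the approximate group-operation count bits/c * (nbPoints + 2^c), where bits is the scalar field size; with 20 and 21 removed the scan now tops out at 16. A standalone sketch of that selection, not part of the patch (254-bit scalars as for bn254; the +1 is a rough ceiling on the chunk count):

package main

import "fmt"

func bestC(nbPoints int) uint64 {
	implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
	const bits = 254 // fr bit length for bn254
	var best uint64
	min := uint64(1 << 62)
	for _, c := range implementedCs {
		// chunks x (one add per point + bucket reduction)
		cost := (bits/c + 1) * (uint64(nbPoints) + 1<<c)
		if cost < min {
			min = cost
			best = c
		}
	}
	return best
}

func main() {
	for _, n := range []int{1 << 10, 1 << 16, 1 << 20} {
		fmt.Printf("nbPoints=%8d -> c=%d\n", n, bestC(n))
	}
	// larger inputs push toward wider windows, capped at 16
}
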
processChunkG1BatchAffine[bucketG1AffineC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) - case 20: - processChunk := processChunkG1BatchAffine[bucketG1AffineC20] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) - case 21: - processChunk := processChunkG1BatchAffine[bucketG1AffineC21] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -338,7 +330,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -452,14 +444,6 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstC case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) - case 20: - processChunk := processChunkG2BatchAffine[bucketG2AffineC20] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) - case 21: - processChunk := processChunkG2BatchAffine[bucketG2AffineC21] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index 42d6413264..62c70e876a 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -16,7 +16,7 @@ package bn254 -const MAX_BATCH_SIZE = 2000 +const MAX_BATCH_SIZE = 600 type batchOp struct { bucketID, pointID uint32 @@ -45,15 +45,15 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G1Affine // ... @@ -71,7 +71,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if cptP == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + BatchAddG1Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -120,24 +120,36 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. 
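MAX_BATCH_SIZE drops from 2000 to 600. It bounds every per-chunk scratch array declared by value (P, R, lambda, lambdain, isDbl, and now the queue itself), so the constant directly sets the stack frame of the chunk processor. A rough count for bn254 G1, with assumed sizes for illustration (the patch gives no figures):

package main

import "fmt"

func main() {
	const maxBatchSize = 600
	const (
		affineSize = 64 // assumed: G1Affine, 2 x 32-byte fp.Element
		fpSize     = 32 // assumed: fp.Element, 4 x uint64
		ptrSize    = 8
		opSize     = 8 // batchOp: two uint32
	)
	perEntry := affineSize + // P
		ptrSize + // R
		2*fpSize + // lambda + lambdain
		1 + // isDbl
		opSize // queue
	fmt.Printf("per-chunk scratch: about %d KiB (vs %d KiB at 2000)\n",
		maxBatchSize*perEntry/1024, 2000*perEntry/1024)
}
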
+ var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -158,22 +170,23 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. } @@ -213,8 +226,6 @@ type bucketG1AffineC13 [1 << (13 - 1)]G1Affine type bucketG1AffineC14 [1 << (14 - 1)]G1Affine type bucketG1AffineC15 [1 << (15 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine -type bucketG1AffineC20 [1 << (20 - 1)]G1Affine -type bucketG1AffineC21 [1 << (21 - 1)]G1Affine type ibG1Affine interface { bucketG1AffineC4 | @@ -229,9 +240,7 @@ type ibG1Affine interface { bucketG1AffineC13 | bucketG1AffineC14 | bucketG1AffineC15 | - bucketG1AffineC16 | - bucketG1AffineC20 | - bucketG1AffineC21 + bucketG1AffineC16 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -253,15 +262,15 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G2Affine // ... @@ -279,7 +288,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if cptP == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + BatchAddG2Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -328,24 +337,36 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. 
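Why ops are queued at all: a batch must never contain two additions into the same bucket, because BatchAdd* reads every R[j] before writing any of them, so a second addition against the same bucket would use a stale value. bucketIds is the conflict set behind canAdd, cleared in place after each flush (the delete-in-range idiom reuses the map's storage across batches). A compact sketch of that gatekeeping, not part of the patch, with executeAndReset standing in for the real batch add:

package main

import "fmt"

func main() {
	const batchSize = 3
	bucketIds := make(map[uint32]struct{}, batchSize)
	pending := 0

	canAdd := func(bID uint32) bool {
		_, ok := bucketIds[bID]
		return !ok
	}
	add := func(bID uint32) {
		bucketIds[bID] = struct{}{}
		pending++
	}
	executeAndReset := func() {
		fmt.Println("flush batch of", pending)
		for k := range bucketIds { // clear without reallocating
			delete(bucketIds, k)
		}
		pending = 0
	}

	for _, bID := range []uint32{7, 9, 7, 3, 9} {
		if !canAdd(bID) {
			fmt.Println("conflict on bucket", bID, "-> would be queued")
			continue
		}
		add(bID)
		if pending == batchSize {
			executeAndReset()
		}
	}
	if pending > 0 {
		executeAndReset() // execute batch even if not full
	}
}
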
+ var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -366,22 +387,23 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. } @@ -421,8 +443,6 @@ type bucketG2AffineC13 [1 << (13 - 1)]G2Affine type bucketG2AffineC14 [1 << (14 - 1)]G2Affine type bucketG2AffineC15 [1 << (15 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine -type bucketG2AffineC20 [1 << (20 - 1)]G2Affine -type bucketG2AffineC21 [1 << (21 - 1)]G2Affine type ibG2Affine interface { bucketG2AffineC4 | @@ -437,7 +457,5 @@ type ibG2Affine interface { bucketG2AffineC13 | bucketG2AffineC14 | bucketG2AffineC15 | - bucketG2AffineC16 | - bucketG2AffineC20 | - bucketG2AffineC21 + bucketG2AffineC16 } diff --git a/ecc/bn254/multiexp_jacobian.go b/ecc/bn254/multiexp_jacobian.go index ef34f5faad..a682232ec6 100644 --- a/ecc/bn254/multiexp_jacobian.go +++ b/ecc/bn254/multiexp_jacobian.go @@ -74,8 +74,6 @@ type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended -type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended @@ -94,9 +92,7 @@ type ibg1JacExtended interface { bucketg1JacExtendedC13 | bucketg1JacExtendedC14 | bucketg1JacExtendedC15 | - bucketg1JacExtendedC16 | - bucketg1JacExtendedC20 | - bucketg1JacExtendedC21 + bucketg1JacExtendedC16 } func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, @@ -157,8 +153,6 @@ type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended -type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended @@ -177,7 +171,5 @@ type ibg2JacExtended interface { bucketg2JacExtendedC13 | bucketg2JacExtendedC14 | 
bucketg2JacExtendedC15 | - bucketg2JacExtendedC16 | - bucketg2JacExtendedC20 | - bucketg2JacExtendedC21 + bucketg2JacExtendedC16 } diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index 7379ddccbd..bf12818dd5 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -275,17 +275,10 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -295,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G1Affine @@ -606,17 +599,10 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -626,7 +612,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G2Affine diff --git a/ecc/bw6-633/g1.go b/ecc/bw6-633/g1.go index 860ce3e355..41a18cf2af 100644 --- a/ecc/bw6-633/g1.go +++ b/ecc/bw6-633/g1.go @@ -1087,31 +1087,27 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { +func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fp.Element - - { - var lambdain [MAX_BATCH_SIZE]fp.Element - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG1Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fp.Element + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) 
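The benchmark edits are the same across all curves: the separate "ext-jacobian" and "affine" sub-benchmarks collapse into a single "N points" run per size (the implementation now selects the kernel itself), the size sweep starts at 2^5 instead of 2^14, and the reference benchmark grows from 2^20 to 2^23 points. The b.Run sub-benchmark pattern they all use, on a stand-in function (not part of the patch; run with `go test -bench=.`):

package bench

import (
	"fmt"
	"testing"
)

// stand-in for MultiExp: sums the first n ints
func sum(xs []int) int {
	s := 0
	for _, x := range xs {
		s += x
	}
	return s
}

func BenchmarkSum(b *testing.B) {
	const pow = 12
	samples := make([]int, 1<<pow)

	for i := 5; i <= pow; i++ {
		using := 1 << i
		b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) {
			b.ResetTimer() // exclude the setup above
			for j := 0; j < b.N; j++ {
				sum(samples[:using])
			}
		})
	}
}
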
+ } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fp.Element var rr G1Affine @@ -1140,19 +1136,19 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { // batch inversion // similar to BatchInvertfp.Element, ignores edge cases -func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { +func batchInvertG1Affine(res, a []fp.Element) { var accumulator fp.Element accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bw6-633/g2.go b/ecc/bw6-633/g2.go index 12579994ab..de70170a12 100644 --- a/ecc/bw6-633/g2.go +++ b/ecc/bw6-633/g2.go @@ -950,31 +950,27 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { +func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fp.Element - - { - var lambdain [MAX_BATCH_SIZE]fp.Element - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG2Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fp.Element + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fp.Element var rr G2Affine @@ -1003,19 +999,19 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { // batch inversion // similar to BatchInvertfp.Element, ignores edge cases -func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { +func batchInvertG2Affine(res, a []fp.Element) { var accumulator fp.Element accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index 49484c36c9..1a7d1b4abe 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -16,7 +16,7 @@ package bw6633 -const MAX_BATCH_SIZE = 2000 +const MAX_BATCH_SIZE = 600 type batchOp struct { bucketID, pointID uint32 @@ -45,15 +45,15 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, 
batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G1Affine // ... @@ -71,7 +71,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if cptP == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + BatchAddG1Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -120,24 +120,36 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. + var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -158,22 +170,23 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. } @@ -231,15 +244,15 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G2Affine // ... @@ -257,7 +270,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if cptP == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + BatchAddG2Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -306,24 +319,36 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. 
+ var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -344,22 +369,23 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. } diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index b367a2fe2f..38f438c71e 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -275,17 +275,10 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -295,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G1Affine @@ -606,17 +599,10 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -626,7 +612,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G2Affine diff --git a/ecc/bw6-756/g1.go b/ecc/bw6-756/g1.go index d53b7f5f82..e1c7e9056a 100644 --- a/ecc/bw6-756/g1.go +++ b/ecc/bw6-756/g1.go @@ -1087,31 +1087,27 @@ func BatchScalarMultiplicationG1(base *G1Affine, 
scalars []fr.Element) []G1Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { +func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fp.Element - - { - var lambdain [MAX_BATCH_SIZE]fp.Element - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG1Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fp.Element + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fp.Element var rr G1Affine @@ -1140,19 +1136,19 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { // batch inversion // similar to BatchInvertfp.Element, ignores edge cases -func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { +func batchInvertG1Affine(res, a []fp.Element) { var accumulator fp.Element accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bw6-756/g2.go b/ecc/bw6-756/g2.go index 049841f4f7..5302819c4b 100644 --- a/ecc/bw6-756/g2.go +++ b/ecc/bw6-756/g2.go @@ -944,31 +944,27 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { +func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fp.Element - - { - var lambdain [MAX_BATCH_SIZE]fp.Element - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG2Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fp.Element + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fp.Element var rr G2Affine @@ -997,19 +993,19 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { // batch inversion // similar to BatchInvertfp.Element, ignores edge cases -func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { +func batchInvertG2Affine(res, a []fp.Element) { var accumulator fp.Element accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { 
+ for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index e1daea4ffe..93b394e246 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -16,7 +16,7 @@ package bw6756 -const MAX_BATCH_SIZE = 2000 +const MAX_BATCH_SIZE = 600 type batchOp struct { bucketID, pointID uint32 @@ -45,15 +45,15 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G1Affine // ... @@ -71,7 +71,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if cptP == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + BatchAddG1Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -120,24 +120,36 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. + var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -158,22 +170,23 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. 
} @@ -231,15 +244,15 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G2Affine // ... @@ -257,7 +270,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if cptP == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + BatchAddG2Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -306,24 +319,36 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. + var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -344,22 +369,23 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. 
} diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index f82d71c32f..ee8c765cd2 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -275,17 +275,10 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -295,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G1Affine @@ -606,17 +599,10 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -626,7 +612,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G2Affine diff --git a/ecc/bw6-761/g1.go b/ecc/bw6-761/g1.go index 8694980eda..86b99ebd1a 100644 --- a/ecc/bw6-761/g1.go +++ b/ecc/bw6-761/g1.go @@ -1098,31 +1098,27 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { +func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fp.Element - - { - var lambdain [MAX_BATCH_SIZE]fp.Element - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG1Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fp.Element + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fp.Element var rr G1Affine @@ -1151,19 +1147,19 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { // batch inversion // similar to BatchInvertfp.Element, ignores edge cases -func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { +func batchInvertG1Affine(res, a []fp.Element) { var accumulator fp.Element accumulator.SetOne() - for i := 0; i < n; i++ { + for 
i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bw6-761/g2.go b/ecc/bw6-761/g2.go index 3198411f9e..77c4e1d375 100644 --- a/ecc/bw6-761/g2.go +++ b/ecc/bw6-761/g2.go @@ -958,31 +958,27 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { +func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fp.Element - - { - var lambdain [MAX_BATCH_SIZE]fp.Element - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG2Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fp.Element + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fp.Element var rr G2Affine @@ -1011,19 +1007,19 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { // batch inversion // similar to BatchInvertfp.Element, ignores edge cases -func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { +func batchInvertG2Affine(res, a []fp.Element) { var accumulator fp.Element accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index 9edda2244b..cdd2c92daf 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -16,7 +16,7 @@ package bw6761 -const MAX_BATCH_SIZE = 2000 +const MAX_BATCH_SIZE = 600 type batchOp struct { bucketID, pointID uint32 @@ -45,15 +45,15 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G1Affine // ... 
@@ -71,7 +71,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if cptP == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + BatchAddG1Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -120,24 +120,36 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. + var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -158,22 +170,23 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. } @@ -231,15 +244,15 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G2Affine // ... @@ -257,7 +270,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if cptP == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + BatchAddG2Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -306,24 +319,36 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. 
+ var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -344,22 +369,23 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. } diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index 884a8564f8..5cab324216 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -275,17 +275,10 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -295,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G1Affine @@ -606,17 +599,10 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -626,7 +612,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G2Affine diff --git a/internal/generator/config/curve.go b/internal/generator/config/curve.go index e1df940957..1ff4926ccf 100644 --- a/internal/generator/config/curve.go +++ b/internal/generator/config/curve.go @@ 
-68,7 +68,7 @@ var TwistedEdwardsCurves []TwistedEdwardsCurve func defaultCRange() []int { // default range for C values in the multiExp - return []int{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + return []int{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} } func addCurve(c *Curve) { diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index a075f3b083..897401430f 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -7,7 +7,7 @@ {{ $G2TJacobianExtended := print (toLower .G2.PointName) "JacExtended" }} -const MAX_BATCH_SIZE = 2000 +const MAX_BATCH_SIZE = 600 type batchOp struct { bucketID, pointID uint32 @@ -46,14 +46,14 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64 } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + bucketIds := make(map[uint32]struct{}, batchSize) cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]{{ $.TAffine }} // allocated on the stack @@ -72,7 +72,7 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64 if cptP == 0 { return } - BatchAdd{{ $.TAffine }}(R[:cptP], P[:cptP], cptP) + BatchAdd{{ $.TAffine }}(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -122,25 +122,36 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64 } - queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. - + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. + var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func () { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -161,22 +172,23 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64 add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE - 1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. 
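For reference, the per-point work that the BatchAdd template below amortizes is the standard short-Weierstrass chord-and-tangent step, with the expensive division replaced by a multiplication by a batch-inverted denominator. A toy, runnable version over F_101; the helper, curve, and variable names are illustrative only:

package main

import (
	"fmt"
	"math/big"
)

var p = big.NewInt(101) // toy field standing in for the base field modulus

func mod(z *big.Int) *big.Int { return z.Mod(z, p) }

// affineStep applies the chord/tangent formulas once the denominator's
// inverse lamInv is known (BatchAdd gets it from the shared batchInvert):
//
//	add (P != R): lambda = (yP - yR) * lamInv, with lamInv = 1/(xP - xR)
//	dbl (P == R): lambda = 3*xP^2   * lamInv, with lamInv = 1/(2*yP)
//	x' = lambda^2 - xR - xP,  y' = lambda*(xR - x') - yR
func affineStep(xR, yR, xP, yP, lamInv *big.Int, isDbl bool) (x3, y3 *big.Int) {
	lambda := new(big.Int)
	if isDbl {
		lambda.Mul(xP, xP)
		lambda.Mul(lambda, big.NewInt(3))
	} else {
		lambda.Sub(yP, yR)
	}
	mod(lambda.Mul(lambda, lamInv))
	x3 = new(big.Int).Mul(lambda, lambda)
	x3.Sub(x3, xR)
	mod(x3.Sub(x3, xP))
	y3 = new(big.Int).Sub(xR, x3)
	y3.Mul(y3, lambda)
	mod(y3.Sub(y3, yR))
	return
}

func main() {
	// P = (1, 2) lies on y^2 = x^3 + 3 over F_101; double it.
	x, y := big.NewInt(1), big.NewInt(2)
	lamInv := new(big.Int).ModInverse(new(big.Int).Lsh(y, 1), p) // 1/(2y)
	x3, y3 := affineStep(x, y, x, y, lamInv, true)
	fmt.Println(x3, y3) // 68 74, and 74^2 == 68^3 + 3 mod 101
}

In the batch, lamInv comes from one shared batchInvert call over all denominators, which is what the "cost add: 5*batchSize M + 1I, dbl: +1M" note in the comments is counting.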
} diff --git a/internal/generator/ecc/template/point.go.tmpl b/internal/generator/ecc/template/point.go.tmpl index 9fc9cc1651..c88ca29b88 100644 --- a/internal/generator/ecc/template/point.go.tmpl +++ b/internal/generator/ecc/template/point.go.tmpl @@ -1574,32 +1574,28 @@ func BatchScalarMultiplication{{ toUpper .PointName }}(base *{{ $TAffine }}, sca // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAdd{{ $TAffine }}(R []*{{ $TAffine }}, P []{{ $TAffine }}, batchSize int) { +func BatchAdd{{ $TAffine }}(R []*{{ $TAffine }}, P []{{ $TAffine }}) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]{{.CoordType}} + var lambda, lambdain [MAX_BATCH_SIZE]{{.CoordType}} - { - var lambdain [MAX_BATCH_SIZE]{{.CoordType}} - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) } - - // invert denominator - BatchInvert{{ $TAffine }}(&lambda, &lambdain, batchSize) - } + // invert denominator + batchInvert{{ $TAffine }}(lambda[:batchSize], lambdain[:batchSize]) + var d {{.CoordType}} var rr {{ $TAffine }} @@ -1630,19 +1626,19 @@ func BatchAdd{{ $TAffine }}(R []*{{ $TAffine }}, P []{{ $TAffine }}, batchSize i // batch inversion // similar to BatchInvert{{.CoordType}}, ignores edge cases -func BatchInvert{{ $TAffine }}(res, a *[MAX_BATCH_SIZE]{{.CoordType}}, n int) { +func batchInvert{{ $TAffine }}(res, a []{{.CoordType}}) { var accumulator {{.CoordType}} accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index c673254b2f..f75da11996 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -269,6 +269,8 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } + + func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { @@ -285,19 +287,13 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { fillBenchScalars(sampleScalars[:]) fillBenchBases{{ toUpper $.PointName }}(samplePoints[:]) + var testPoint {{ $.TAffine }} - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using],ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using],ecc.MultiExpConfig{}) @@ -308,7 +304,7 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { func BenchmarkMultiExp{{ toUpper $.PointName }}Reference(b 
*testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]{{ $.TAffine }} From bc85933359b7e97d48eb5b3f7677c5514f266bab Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 9 Nov 2022 12:27:45 -0600 Subject: [PATCH 10/43] test: gen scalars and bases in parallel --- ecc/bls12-377/multiexp_test.go | 55 ++++++++++--------- ecc/bls12-378/multiexp_test.go | 55 ++++++++++--------- ecc/bls12-381/multiexp_test.go | 55 ++++++++++--------- ecc/bls24-315/multiexp_test.go | 55 ++++++++++--------- ecc/bls24-317/multiexp_test.go | 55 ++++++++++--------- ecc/bn254/multiexp_test.go | 55 ++++++++++--------- ecc/bw6-633/multiexp_test.go | 55 ++++++++++--------- ecc/bw6-756/multiexp_test.go | 55 ++++++++++--------- ecc/bw6-761/multiexp_test.go | 55 ++++++++++--------- .../ecc/template/tests/multiexp.go.tmpl | 33 ++++++----- 10 files changed, 289 insertions(+), 239 deletions(-) diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index ca2e50f59c..e9e7fd9c67 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -17,7 +17,9 @@ package bls12377 import ( + rrand "crypto/rand" "fmt" + "math" "math/big" "math/bits" "math/rand" @@ -28,6 +30,7 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" + "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -345,17 +348,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG1(samplePoints []G1Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g1GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func TestMultiExpG2(t *testing.T) { @@ -669,22 +672,24 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. 
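The parallel fillers below all follow one pattern: parallel.Execute hands each worker a contiguous range, the range's first element is seeded from crypto/rand, and the rest are derived by a cheap deterministic walk. Here is a standalone approximation of that pattern; parallelExecute is a simplified stand-in for gnark-crypto's internal parallel.Execute, not its actual implementation:

package main

import (
	crand "crypto/rand"
	"fmt"
	"math"
	"math/big"
	"runtime"
	"sync"
)

// parallelExecute splits [0, n) into one contiguous range per CPU and runs
// work on each range in its own goroutine.
func parallelExecute(n int, work func(start, end int)) {
	nbCPU := runtime.NumCPU()
	chunk := (n + nbCPU - 1) / nbCPU
	var wg sync.WaitGroup
	for start := 0; start < n; start += chunk {
		end := start + chunk
		if end > n {
			end = n
		}
		wg.Add(1)
		go func(s, e int) { defer wg.Done(); work(s, e) }(start, end)
	}
	wg.Wait()
}

func main() {
	// Each range seeds its first element from crypto/rand (bounded by
	// MaxInt64, as in the tests) and fills the rest by a cheap walk.
	out := make([]*big.Int, 1<<10)
	max := new(big.Int).SetInt64(math.MaxInt64)
	parallelExecute(len(out), func(start, end int) {
		r, _ := crand.Int(crand.Reader, max)
		out[start] = r
		for i := start + 1; i < end; i++ {
			out[i] = new(big.Int).Add(out[i-1], big.NewInt(1))
		}
	})
	fmt.Println(out[0], out[len(out)-1])
}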
func fillBenchBasesG2(samplePoints []G2Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g2GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - for i := 0; i < len(sampleScalars); i++ { - sampleScalars[i].SetRandom() - } + parallel.Execute(len(sampleScalars), func(start, end int) { + for i := start; i < end; i++ { + sampleScalars[i].SetRandom() + } + }) } diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index 339323bbac..65729bd9c1 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -17,7 +17,9 @@ package bls12378 import ( + rrand "crypto/rand" "fmt" + "math" "math/big" "math/bits" "math/rand" @@ -28,6 +30,7 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-378/fr" + "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -345,17 +348,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG1(samplePoints []G1Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g1GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func TestMultiExpG2(t *testing.T) { @@ -669,22 +672,24 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. 
func fillBenchBasesG2(samplePoints []G2Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g2GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - for i := 0; i < len(sampleScalars); i++ { - sampleScalars[i].SetRandom() - } + parallel.Execute(len(sampleScalars), func(start, end int) { + for i := start; i < end; i++ { + sampleScalars[i].SetRandom() + } + }) } diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index ce2153872e..4b357be4d9 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -17,7 +17,9 @@ package bls12381 import ( + rrand "crypto/rand" "fmt" + "math" "math/big" "math/bits" "math/rand" @@ -28,6 +30,7 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-381/fr" + "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -345,17 +348,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG1(samplePoints []G1Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g1GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func TestMultiExpG2(t *testing.T) { @@ -669,22 +672,24 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. 
func fillBenchBasesG2(samplePoints []G2Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g2GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - for i := 0; i < len(sampleScalars); i++ { - sampleScalars[i].SetRandom() - } + parallel.Execute(len(sampleScalars), func(start, end int) { + for i := start; i < end; i++ { + sampleScalars[i].SetRandom() + } + }) } diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index f8513bd3a1..0773215145 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -17,7 +17,9 @@ package bls24315 import ( + rrand "crypto/rand" "fmt" + "math" "math/big" "math/bits" "math/rand" @@ -28,6 +30,7 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls24-315/fr" + "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -345,17 +348,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG1(samplePoints []G1Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g1GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func TestMultiExpG2(t *testing.T) { @@ -669,22 +672,24 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. 
func fillBenchBasesG2(samplePoints []G2Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g2GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - for i := 0; i < len(sampleScalars); i++ { - sampleScalars[i].SetRandom() - } + parallel.Execute(len(sampleScalars), func(start, end int) { + for i := start; i < end; i++ { + sampleScalars[i].SetRandom() + } + }) } diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index 214d884e11..9ebe0c4217 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -17,7 +17,9 @@ package bls24317 import ( + rrand "crypto/rand" "fmt" + "math" "math/big" "math/bits" "math/rand" @@ -28,6 +30,7 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls24-317/fr" + "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -345,17 +348,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG1(samplePoints []G1Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g1GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func TestMultiExpG2(t *testing.T) { @@ -669,22 +672,24 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. 
func fillBenchBasesG2(samplePoints []G2Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g2GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - for i := 0; i < len(sampleScalars); i++ { - sampleScalars[i].SetRandom() - } + parallel.Execute(len(sampleScalars), func(start, end int) { + for i := start; i < end; i++ { + sampleScalars[i].SetRandom() + } + }) } diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index bf12818dd5..f77115cab8 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -17,7 +17,9 @@ package bn254 import ( + rrand "crypto/rand" "fmt" + "math" "math/big" "math/bits" "math/rand" @@ -28,6 +30,7 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bn254/fr" + "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -345,17 +348,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG1(samplePoints []G1Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g1GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func TestMultiExpG2(t *testing.T) { @@ -669,22 +672,24 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. 
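A side note on the two random sources the updated tests import: crypto/rand (aliased rrand) gives each worker a starting scalar without any shared *rand.Rand to coordinate, while math/rand stays in use for cheap reproducible plumbing such as the rand.Shuffle calls in the benchmarks added later in this series. A minimal illustration of the aliased imports; the rationale is my reading, not stated in the patch:

package main

import (
	crand "crypto/rand" // aliased, like rrand in the tests, to coexist with math/rand
	"fmt"
	"math"
	"math/big"
	mrand "math/rand"
)

func main() {
	// crypto/rand: safe for concurrent use with no seeding, bounded here by
	// MaxInt64 as in fillBenchBases.
	max := new(big.Int).SetInt64(math.MaxInt64)
	seed, err := crand.Int(crand.Reader, max)
	if err != nil {
		panic(err)
	}

	// math/rand: cheap pseudo-randomness, e.g. for permuting bench inputs.
	perm := mrand.Perm(8)
	mrand.Shuffle(len(perm), func(i, j int) { perm[i], perm[j] = perm[j], perm[i] })

	fmt.Println(seed, perm)
}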
func fillBenchBasesG2(samplePoints []G2Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g2GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - for i := 0; i < len(sampleScalars); i++ { - sampleScalars[i].SetRandom() - } + parallel.Execute(len(sampleScalars), func(start, end int) { + for i := start; i < end; i++ { + sampleScalars[i].SetRandom() + } + }) } diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 38f438c71e..6946fe3b65 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -17,7 +17,9 @@ package bw6633 import ( + rrand "crypto/rand" "fmt" + "math" "math/big" "math/bits" "math/rand" @@ -28,6 +30,7 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-633/fr" + "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -345,17 +348,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG1(samplePoints []G1Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g1GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func TestMultiExpG2(t *testing.T) { @@ -669,22 +672,24 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. 
func fillBenchBasesG2(samplePoints []G2Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g2GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - for i := 0; i < len(sampleScalars); i++ { - sampleScalars[i].SetRandom() - } + parallel.Execute(len(sampleScalars), func(start, end int) { + for i := start; i < end; i++ { + sampleScalars[i].SetRandom() + } + }) } diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index ee8c765cd2..73217d1e6a 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -17,7 +17,9 @@ package bw6756 import ( + rrand "crypto/rand" "fmt" + "math" "math/big" "math/bits" "math/rand" @@ -28,6 +30,7 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-756/fr" + "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -345,17 +348,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG1(samplePoints []G1Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g1GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func TestMultiExpG2(t *testing.T) { @@ -669,22 +672,24 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. 
func fillBenchBasesG2(samplePoints []G2Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g2GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - for i := 0; i < len(sampleScalars); i++ { - sampleScalars[i].SetRandom() - } + parallel.Execute(len(sampleScalars), func(start, end int) { + for i := start; i < end; i++ { + sampleScalars[i].SetRandom() + } + }) } diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index 5cab324216..e36993cfff 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -17,7 +17,9 @@ package bw6761 import ( + rrand "crypto/rand" "fmt" + "math" "math/big" "math/bits" "math/rand" @@ -28,6 +30,7 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" + "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -345,17 +348,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG1(samplePoints []G1Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g1GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func TestMultiExpG2(t *testing.T) { @@ -669,22 +672,24 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. 
func fillBenchBasesG2(samplePoints []G2Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g2GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - for i := 0; i < len(sampleScalars); i++ { - sampleScalars[i].SetRandom() - } + parallel.Execute(len(sampleScalars), func(start, end int) { + for i := start; i < end; i++ { + sampleScalars[i].SetRandom() + } + }) } diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index f75da11996..25102e24d0 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -11,12 +11,15 @@ import ( "fmt" "time" "math/rand" + rrand "crypto/rand" "math/big" "testing" "runtime" "math/bits" "sync" + "math" + "github.com/consensys/gnark-crypto/internal/parallel" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/{{.Name}}/fr" "github.com/leanovate/gopter" @@ -363,24 +366,26 @@ func BenchmarkManyMultiExp{{ toUpper $.PointName }}Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. 
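One property the fillers rely on, per the comment above: bases should be pairwise distinct so that bucket accumulation exercises the addition path rather than the doubling path. A hypothetical sanity check one could drop next to the tests; the helper name, its placement, and the map-key use of fp.Element are assumptions, not part of the patch:

// assertDistinctX is a hypothetical sanity check for the bench setup: if all
// X coordinates differ, P[j].Equal(R[j]) can never be true, so BatchAdd stays
// on the addition branch instead of the "+1M" doubling branch.
// Assumes it lives in a curve package (e.g. next to g1_test.go), where
// fp.Element is a fixed-size array and therefore usable as a map key.
func assertDistinctX(points []G1Affine) {
	seen := make(map[fp.Element]struct{}, len(points))
	for i := range points {
		if _, dup := seen[points[i].X]; dup {
			panic("duplicate X coordinate: benchmark would bias toward doubling")
		}
		seen[points[i].X] = struct{}{}
	}
}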
func fillBenchBases{{ toUpper $.PointName }}(samplePoints []{{ $.TAffine }}) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&{{$.PointName}}GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start+1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } {{end }} func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - for i := 0; i < len(sampleScalars); i++ { - sampleScalars[i].SetRandom() - } + parallel.Execute(len(sampleScalars), func(start, end int) { + for i := start; i < end; i++ { + sampleScalars[i].SetRandom() + } + }) } From 091d0d56443dec14cd2884a76f2158bb61f5714b Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 9 Nov 2022 13:49:55 -0600 Subject: [PATCH 11/43] test: add BatchAdd benchmark --- ecc/bls12-377/g1_test.go | 27 +++++++++++++++++++ ecc/bls12-377/g2_test.go | 27 +++++++++++++++++++ ecc/bls12-378/g1_test.go | 27 +++++++++++++++++++ ecc/bls12-378/g2_test.go | 27 +++++++++++++++++++ ecc/bls12-381/g1_test.go | 27 +++++++++++++++++++ ecc/bls12-381/g2_test.go | 27 +++++++++++++++++++ ecc/bls24-315/g1_test.go | 27 +++++++++++++++++++ ecc/bls24-315/g2_test.go | 27 +++++++++++++++++++ ecc/bls24-317/g1_test.go | 27 +++++++++++++++++++ ecc/bls24-317/g2_test.go | 27 +++++++++++++++++++ ecc/bn254/g1_test.go | 27 +++++++++++++++++++ ecc/bn254/g2_test.go | 27 +++++++++++++++++++ ecc/bw6-633/g1_test.go | 27 +++++++++++++++++++ ecc/bw6-633/g2_test.go | 27 +++++++++++++++++++ ecc/bw6-756/g1_test.go | 27 +++++++++++++++++++ ecc/bw6-756/g2_test.go | 27 +++++++++++++++++++ ecc/bw6-761/g1_test.go | 27 +++++++++++++++++++ ecc/bw6-761/g2_test.go | 27 +++++++++++++++++++ .../ecc/template/tests/point.go.tmpl | 27 +++++++++++++++++++ 19 files changed, 513 insertions(+) diff --git a/ecc/bls12-377/g1_test.go b/ecc/bls12-377/g1_test.go index 3209de0cd2..afb23458b1 100644 --- a/ecc/bls12-377/g1_test.go +++ b/ecc/bls12-377/g1_test.go @@ -19,6 +19,7 @@ package bls12377 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-377/fp" @@ -499,6 +500,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG1Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G1Affine + var RR [MAX_BATCH_SIZE]*G1Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG1Affine(RR[:], P[:]) + } + +} + func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bls12-377/g2_test.go b/ecc/bls12-377/g2_test.go index d3b0af12be..52e3ff41c1 100644 --- a/ecc/bls12-377/g2_test.go +++ b/ecc/bls12-377/g2_test.go @@ -19,6 +19,7 @@ package bls12377 import ( "fmt" "math/big" + 
"math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-377/internal/fptower" @@ -505,6 +506,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG2Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G2Affine + var RR [MAX_BATCH_SIZE]*G2Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG2Affine(RR[:], P[:]) + } + +} + func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bls12-378/g1_test.go b/ecc/bls12-378/g1_test.go index dccaca15c7..3859bb2695 100644 --- a/ecc/bls12-378/g1_test.go +++ b/ecc/bls12-378/g1_test.go @@ -19,6 +19,7 @@ package bls12378 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-378/fp" @@ -499,6 +500,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG1Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G1Affine + var RR [MAX_BATCH_SIZE]*G1Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG1Affine(RR[:], P[:]) + } + +} + func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bls12-378/g2_test.go b/ecc/bls12-378/g2_test.go index 21e79d7238..f81d14069b 100644 --- a/ecc/bls12-378/g2_test.go +++ b/ecc/bls12-378/g2_test.go @@ -19,6 +19,7 @@ package bls12378 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-378/internal/fptower" @@ -505,6 +506,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG2Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G2Affine + var RR [MAX_BATCH_SIZE]*G2Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG2Affine(RR[:], P[:]) + } + +} + func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bls12-381/g1_test.go b/ecc/bls12-381/g1_test.go index 9aa3311f05..ee4ce9fb21 100644 --- a/ecc/bls12-381/g1_test.go +++ b/ecc/bls12-381/g1_test.go @@ -19,6 +19,7 @@ package bls12381 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-381/fp" @@ -499,6 +500,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG1Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G1Affine + var RR [MAX_BATCH_SIZE]*G1Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], 
ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG1Affine(RR[:], P[:]) + } + +} + func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bls12-381/g2_test.go b/ecc/bls12-381/g2_test.go index c259606622..a243b65b01 100644 --- a/ecc/bls12-381/g2_test.go +++ b/ecc/bls12-381/g2_test.go @@ -19,6 +19,7 @@ package bls12381 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-381/internal/fptower" @@ -505,6 +506,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG2Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G2Affine + var RR [MAX_BATCH_SIZE]*G2Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG2Affine(RR[:], P[:]) + } + +} + func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bls24-315/g1_test.go b/ecc/bls24-315/g1_test.go index 5eba73ee93..d1061a803e 100644 --- a/ecc/bls24-315/g1_test.go +++ b/ecc/bls24-315/g1_test.go @@ -19,6 +19,7 @@ package bls24315 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls24-315/fp" @@ -499,6 +500,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG1Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G1Affine + var RR [MAX_BATCH_SIZE]*G1Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG1Affine(RR[:], P[:]) + } + +} + func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bls24-315/g2_test.go b/ecc/bls24-315/g2_test.go index bab8fbad10..ccdac4012c 100644 --- a/ecc/bls24-315/g2_test.go +++ b/ecc/bls24-315/g2_test.go @@ -19,6 +19,7 @@ package bls24315 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls24-315/internal/fptower" @@ -505,6 +506,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG2Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G2Affine + var RR [MAX_BATCH_SIZE]*G2Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG2Affine(RR[:], P[:]) + } + +} + func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bls24-317/g1_test.go b/ecc/bls24-317/g1_test.go index 2c08510b14..3673290566 100644 --- a/ecc/bls24-317/g1_test.go +++ b/ecc/bls24-317/g1_test.go @@ -19,6 +19,7 @@ package bls24317 import ( "fmt" 
"math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls24-317/fp" @@ -499,6 +500,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG1Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G1Affine + var RR [MAX_BATCH_SIZE]*G1Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG1Affine(RR[:], P[:]) + } + +} + func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bls24-317/g2_test.go b/ecc/bls24-317/g2_test.go index 376b469347..74c8576f89 100644 --- a/ecc/bls24-317/g2_test.go +++ b/ecc/bls24-317/g2_test.go @@ -19,6 +19,7 @@ package bls24317 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls24-317/internal/fptower" @@ -505,6 +506,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG2Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G2Affine + var RR [MAX_BATCH_SIZE]*G2Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG2Affine(RR[:], P[:]) + } + +} + func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bn254/g1_test.go b/ecc/bn254/g1_test.go index 8ee025d787..c87502be96 100644 --- a/ecc/bn254/g1_test.go +++ b/ecc/bn254/g1_test.go @@ -19,6 +19,7 @@ package bn254 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bn254/fp" @@ -460,6 +461,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG1Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G1Affine + var RR [MAX_BATCH_SIZE]*G1Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG1Affine(RR[:], P[:]) + } + +} + func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bn254/g2_test.go b/ecc/bn254/g2_test.go index 17c09d95ba..83d34fee91 100644 --- a/ecc/bn254/g2_test.go +++ b/ecc/bn254/g2_test.go @@ -19,6 +19,7 @@ package bn254 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bn254/internal/fptower" @@ -504,6 +505,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG2Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G2Affine + var RR [MAX_BATCH_SIZE]*G2Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx 
{ + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG2Affine(RR[:], P[:]) + } + +} + func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bw6-633/g1_test.go b/ecc/bw6-633/g1_test.go index 6caf91227c..827cee65dd 100644 --- a/ecc/bw6-633/g1_test.go +++ b/ecc/bw6-633/g1_test.go @@ -19,6 +19,7 @@ package bw6633 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-633/fp" @@ -499,6 +500,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG1Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G1Affine + var RR [MAX_BATCH_SIZE]*G1Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG1Affine(RR[:], P[:]) + } + +} + func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bw6-633/g2_test.go b/ecc/bw6-633/g2_test.go index 32773e9718..82ddc5385b 100644 --- a/ecc/bw6-633/g2_test.go +++ b/ecc/bw6-633/g2_test.go @@ -19,6 +19,7 @@ package bw6633 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-633/fp" @@ -486,6 +487,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG2Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G2Affine + var RR [MAX_BATCH_SIZE]*G2Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG2Affine(RR[:], P[:]) + } + +} + func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bw6-756/g1_test.go b/ecc/bw6-756/g1_test.go index 81ecf81553..fc64f7646c 100644 --- a/ecc/bw6-756/g1_test.go +++ b/ecc/bw6-756/g1_test.go @@ -19,6 +19,7 @@ package bw6756 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-756/fp" @@ -499,6 +500,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG1Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G1Affine + var RR [MAX_BATCH_SIZE]*G1Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG1Affine(RR[:], P[:]) + } + +} + func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bw6-756/g2_test.go b/ecc/bw6-756/g2_test.go index ecfc973322..065dc4432e 100644 --- a/ecc/bw6-756/g2_test.go +++ b/ecc/bw6-756/g2_test.go @@ -19,6 +19,7 @@ package bw6756 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-756/fp" @@ -486,6 +487,32 @@ func 
BenchmarkG2JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG2Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G2Affine + var RR [MAX_BATCH_SIZE]*G2Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG2Affine(RR[:], P[:]) + } + +} + func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bw6-761/g1_test.go b/ecc/bw6-761/g1_test.go index 6ace718ac2..3be460742f 100644 --- a/ecc/bw6-761/g1_test.go +++ b/ecc/bw6-761/g1_test.go @@ -19,6 +19,7 @@ package bw6761 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-761/fp" @@ -499,6 +500,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG1Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G1Affine + var RR [MAX_BATCH_SIZE]*G1Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG1Affine(RR[:], P[:]) + } + +} + func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bw6-761/g2_test.go b/ecc/bw6-761/g2_test.go index 9630dbf178..0268875661 100644 --- a/ecc/bw6-761/g2_test.go +++ b/ecc/bw6-761/g2_test.go @@ -19,6 +19,7 @@ package bw6761 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-761/fp" @@ -486,6 +487,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG2Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G2Affine + var RR [MAX_BATCH_SIZE]*G2Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG2Affine(RR[:], P[:]) + } + +} + func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/internal/generator/ecc/template/tests/point.go.tmpl b/internal/generator/ecc/template/tests/point.go.tmpl index 556d9befc3..e033f96dd7 100644 --- a/internal/generator/ecc/template/tests/point.go.tmpl +++ b/internal/generator/ecc/template/tests/point.go.tmpl @@ -16,6 +16,7 @@ import ( "fmt" "math/big" "testing" + "math/rand" {{if or (eq .CoordType "fptower.E2") (eq .CoordType "fptower.E4")}} "github.com/consensys/gnark-crypto/ecc/{{.Name}}/internal/fptower" @@ -559,6 +560,32 @@ func Benchmark{{ $TJacobian }}IsInSubGroup(b *testing.B) { } +func BenchmarkBatchAdd{{ $TAffine }}(b *testing.B) { + var P, R [MAX_BATCH_SIZE]{{ $TAffine }} + var RR [MAX_BATCH_SIZE]*{{ $TAffine }} + var ridx [MAX_BATCH_SIZE]int + + fillBenchBases{{ toUpper $.PointName }}(P[:]) + fillBenchBases{{ toUpper $.PointName }}(R[:]) + + for i:=0; i < len(ridx);i++ { + ridx[i] = i + } + + // 
random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAdd{{ $TAffine }}(RR[:], P[:]) + } + +} + func Benchmark{{ $TAffine }}BatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element From f4b4eea58c4e039931694038ffc2a49fd54e95ba Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 9 Nov 2022 15:22:08 -0600 Subject: [PATCH 12/43] feat: add bitset to do quick bucket presence check in batch --- ecc/bls12-377/multiexp.go | 28 +++--- ecc/bls12-377/multiexp_affine.go | 86 ++++++++++++------- ecc/bls12-378/multiexp.go | 28 +++--- ecc/bls12-378/multiexp_affine.go | 86 ++++++++++++------- ecc/bls12-381/multiexp.go | 28 +++--- ecc/bls12-381/multiexp_affine.go | 86 ++++++++++++------- ecc/bls24-315/multiexp.go | 28 +++--- ecc/bls24-315/multiexp_affine.go | 86 ++++++++++++------- ecc/bls24-317/multiexp.go | 28 +++--- ecc/bls24-317/multiexp_affine.go | 86 ++++++++++++------- ecc/bn254/multiexp.go | 28 +++--- ecc/bn254/multiexp_affine.go | 86 ++++++++++++------- ecc/bw6-633/multiexp.go | 4 +- ecc/bw6-633/multiexp_affine.go | 68 ++++++++------- ecc/bw6-756/multiexp.go | 4 +- ecc/bw6-756/multiexp_affine.go | 68 ++++++++------- ecc/bw6-761/multiexp.go | 4 +- ecc/bw6-761/multiexp_affine.go | 68 ++++++++------- .../generator/ecc/template/multiexp.go.tmpl | 4 +- .../ecc/template/multiexp_affine.go.tmpl | 36 ++++---- 20 files changed, 545 insertions(+), 395 deletions(-) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index 98de6ca242..24e4fe5ee0 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -172,31 +172,31 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstC processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15] + 
processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -418,31 +418,31 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstC processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10] + processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13] + processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15] + processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index 66c9d4e8bf..33cbf3844b 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -32,7 +32,7 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -45,6 +45,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. 
+ // keep things on the stack. batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -52,15 +54,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G1Affine // ... + var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -72,9 +73,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, return } BatchAddG1Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -109,8 +109,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -120,7 +119,6 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -183,9 +181,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. @@ -249,7 +245,7 @@ type ibG1Affine interface { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -262,6 +258,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. + // keep things on the stack. batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -269,15 +267,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G2Affine // ... 
+ var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -289,9 +286,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, return } BatchAddG2Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -326,8 +322,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -337,7 +332,6 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -400,9 +394,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. @@ -459,3 +451,33 @@ type ibG2Affine interface { bucketG2AffineC15 | bucketG2AffineC16 } + +type bitSetC4 [1 << (4 - 1)]bool +type bitSetC5 [1 << (5 - 1)]bool +type bitSetC6 [1 << (6 - 1)]bool +type bitSetC7 [1 << (7 - 1)]bool +type bitSetC8 [1 << (8 - 1)]bool +type bitSetC9 [1 << (9 - 1)]bool +type bitSetC10 [1 << (10 - 1)]bool +type bitSetC11 [1 << (11 - 1)]bool +type bitSetC12 [1 << (12 - 1)]bool +type bitSetC13 [1 << (13 - 1)]bool +type bitSetC14 [1 << (14 - 1)]bool +type bitSetC15 [1 << (15 - 1)]bool +type bitSetC16 [1 << (16 - 1)]bool + +type bitSet interface { + bitSetC4 | + bitSetC5 | + bitSetC6 | + bitSetC7 | + bitSetC8 | + bitSetC9 | + bitSetC10 | + bitSetC11 | + bitSetC12 | + bitSetC13 | + bitSetC14 | + bitSetC15 | + bitSetC16 +} diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 917f493796..3f79b12596 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -172,31 +172,31 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstC processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13] processLastChunk := 
processChunkG1Jacobian[bucketg1JacExtendedC9] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15] + processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -418,31 +418,31 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstC processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10] + processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13] + processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15] + processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index a48d9d1cfd..b631d13a72 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -32,7 +32,7 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 
0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -45,6 +45,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. + // keep things on the stack. batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -52,15 +54,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G1Affine // ... + var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -72,9 +73,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, return } BatchAddG1Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -109,8 +109,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -120,7 +119,6 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -183,9 +181,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. @@ -249,7 +245,7 @@ type ibG1Affine interface { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -262,6 +258,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. + // keep things on the stack. 
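A note on the sizing that follows: the batch is dimensioned at roughly one twentieth of the bucket count, clamped to [1, MAX_BATCH_SIZE]. A standalone sketch of that heuristic, with maxBatch as an illustrative stand-in for the package's MAX_BATCH_SIZE constant:

package main

import "fmt"

const maxBatch = 600 // illustrative bound only; the real code uses MAX_BATCH_SIZE

func batchSize(nbBuckets int) int {
	bs := nbBuckets / 20
	if bs > maxBatch {
		bs = maxBatch
	}
	if bs <= 0 {
		bs = 1
	}
	return bs
}

func main() {
	for _, c := range []int{10, 12, 16} {
		nbBuckets := 1 << (c - 1) // one bucket per signed digit value
		fmt.Printf("c=%d: %d buckets -> batch of %d\n", c, nbBuckets, batchSize(nbBuckets))
	}
}

Larger windows thus amortize each per-batch inversion over more points, until the fixed-size stack arrays cap the batch.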
batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -269,15 +267,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G2Affine // ... + var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -289,9 +286,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, return } BatchAddG2Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -326,8 +322,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -337,7 +332,6 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -400,9 +394,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. 
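Before the bitSetC* definitions in the next hunk, a minimal, self-contained sketch of the presence-check pattern they enable: membership becomes a single array index instead of a map lookup, and the per-batch reset is a zero-value copy instead of a delete loop (demoBitSet and the bucket numbers are illustrative only):

package main

import "fmt"

type demoBitSet [1 << (10 - 1)]bool // shaped like bitSetC10: one flag per bucket

func main() {
	var used demoBitSet

	canAdd := func(bID uint32) bool { return !used[bID] }

	used[42] = true         // bucket 42 now has a pending point in this batch
	fmt.Println(canAdd(42)) // false: a second add would read a stale bucket
	fmt.Println(canAdd(7))  // true

	var tmp demoBitSet // zero value: all flags cleared
	used = tmp         // reset between batches, no per-key deletes
	fmt.Println(canAdd(42)) // true again
}

Because the array length is a compile-time constant per window size, the whole set can live on the stack, which is the hint to the compiler the comments above refer to.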
@@ -459,3 +451,33 @@ type ibG2Affine interface { bucketG2AffineC15 | bucketG2AffineC16 } + +type bitSetC4 [1 << (4 - 1)]bool +type bitSetC5 [1 << (5 - 1)]bool +type bitSetC6 [1 << (6 - 1)]bool +type bitSetC7 [1 << (7 - 1)]bool +type bitSetC8 [1 << (8 - 1)]bool +type bitSetC9 [1 << (9 - 1)]bool +type bitSetC10 [1 << (10 - 1)]bool +type bitSetC11 [1 << (11 - 1)]bool +type bitSetC12 [1 << (12 - 1)]bool +type bitSetC13 [1 << (13 - 1)]bool +type bitSetC14 [1 << (14 - 1)]bool +type bitSetC15 [1 << (15 - 1)]bool +type bitSetC16 [1 << (16 - 1)]bool + +type bitSet interface { + bitSetC4 | + bitSetC5 | + bitSetC6 | + bitSetC7 | + bitSetC8 | + bitSetC9 | + bitSetC10 | + bitSetC11 | + bitSetC12 | + bitSetC13 | + bitSetC14 | + bitSetC15 | + bitSetC16 +} diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index 8283ce4957..5bd22872c9 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -172,31 +172,31 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstC processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15] + processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -418,31 +418,31 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstC processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10] + processChunk := 
processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13] + processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15] + processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index d30c10293e..bf65dc9aa1 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -32,7 +32,7 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -45,6 +45,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. + // keep things on the stack. batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -52,15 +54,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G1Affine // ... 
+ var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -72,9 +73,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, return } BatchAddG1Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -109,8 +109,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -120,7 +119,6 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -183,9 +181,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. @@ -249,7 +245,7 @@ type ibG1Affine interface { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -262,6 +258,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. + // keep things on the stack. batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -269,15 +267,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G2Affine // ... + var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -289,9 +286,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, return } BatchAddG2Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -326,8 +322,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -337,7 +332,6 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. 
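The queue machinery retained below exists for correctness, not speed: if an op targets a bucket already flagged in the current batch, adding it again would compute against a bucket value the pending batch is about to overwrite, so the op is parked and replayed only after executeAndReset has flushed the batch and cleared the bitset. A toy illustration of that scheduling, under the assumption that plain integers stand in for batchOp (every name here is hypothetical):

package main

import "fmt"

func main() {
	const batchCap = 3
	ops := []uint32{1, 2, 2, 3, 2, 4, 1, 4} // bucket IDs; repeats are conflicts

	var inBatch [8]bool // bitset: is this bucket already in the batch?
	batch := make([]uint32, 0, batchCap)
	var queue []uint32

	flush := func() {
		if len(batch) == 0 {
			return
		}
		fmt.Println("execute batch:", batch)
		batch = batch[:0]
		inBatch = [8]bool{} // zero-value reset, as executeAndReset does
	}

	for _, b := range ops {
		if inBatch[b] {
			queue = append(queue, b) // conflict: defer to a later batch
			continue
		}
		inBatch[b] = true
		batch = append(batch, b)
		if len(batch) == batchCap {
			flush()
		}
	}
	for len(queue) > 0 { // drain deferred ops, flushing so conflicts clear
		b := queue[0]
		queue = queue[1:]
		if inBatch[b] {
			flush()
		}
		inBatch[b] = true
		batch = append(batch, b)
	}
	flush()
}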
var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -400,9 +394,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. @@ -459,3 +451,33 @@ type ibG2Affine interface { bucketG2AffineC15 | bucketG2AffineC16 } + +type bitSetC4 [1 << (4 - 1)]bool +type bitSetC5 [1 << (5 - 1)]bool +type bitSetC6 [1 << (6 - 1)]bool +type bitSetC7 [1 << (7 - 1)]bool +type bitSetC8 [1 << (8 - 1)]bool +type bitSetC9 [1 << (9 - 1)]bool +type bitSetC10 [1 << (10 - 1)]bool +type bitSetC11 [1 << (11 - 1)]bool +type bitSetC12 [1 << (12 - 1)]bool +type bitSetC13 [1 << (13 - 1)]bool +type bitSetC14 [1 << (14 - 1)]bool +type bitSetC15 [1 << (15 - 1)]bool +type bitSetC16 [1 << (16 - 1)]bool + +type bitSet interface { + bitSetC4 | + bitSetC5 | + bitSetC6 | + bitSetC7 | + bitSetC8 | + bitSetC9 | + bitSetC10 | + bitSetC11 | + bitSetC12 | + bitSetC13 | + bitSetC14 | + bitSetC15 | + bitSetC16 +} diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 922c80cd89..81dc28638c 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -172,31 +172,31 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstC processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15] + processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] _innerMsmG1(p, 16, 
points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -418,31 +418,31 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstC processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10] + processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13] + processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15] + processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index c7aa56e2d5..50a7c5613a 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -32,7 +32,7 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -45,6 +45,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. + // keep things on the stack. 
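Every branch of this processor eventually funnels its batch into BatchAddG1Affine, so it is worth restating what that call computes per slot once the shared batch inversion has produced the denominators. With R_j the bucket, P_j the incoming point, and a = 0 on these curves:

$$
\lambda_j =
\begin{cases}
\dfrac{3\,x_{P_j}^{2}}{2\,y_{P_j}} & \text{if } P_j = R_j \ \text{(doubling)}\\[2ex]
\dfrac{y_{P_j} - y_{R_j}}{x_{P_j} - x_{R_j}} & \text{otherwise,}
\end{cases}
\qquad
x_j' = \lambda_j^{2} - x_{R_j} - x_{P_j},
\qquad
y_j' = \lambda_j\,(x_{R_j} - x_j') - y_{R_j}.
$$

All the denominators (2y or x_P - x_R) are inverted together, so each point in the batch costs a handful of field multiplications while the single inversion is amortized across the whole batch.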
batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -52,15 +54,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G1Affine // ... + var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -72,9 +73,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, return } BatchAddG1Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -109,8 +109,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -120,7 +119,6 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -183,9 +181,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. @@ -249,7 +245,7 @@ type ibG1Affine interface { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -262,6 +258,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. + // keep things on the stack. batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -269,15 +267,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G2Affine // ... 
+ var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -289,9 +286,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, return } BatchAddG2Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -326,8 +322,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -337,7 +332,6 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -400,9 +394,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. @@ -459,3 +451,33 @@ type ibG2Affine interface { bucketG2AffineC15 | bucketG2AffineC16 } + +type bitSetC4 [1 << (4 - 1)]bool +type bitSetC5 [1 << (5 - 1)]bool +type bitSetC6 [1 << (6 - 1)]bool +type bitSetC7 [1 << (7 - 1)]bool +type bitSetC8 [1 << (8 - 1)]bool +type bitSetC9 [1 << (9 - 1)]bool +type bitSetC10 [1 << (10 - 1)]bool +type bitSetC11 [1 << (11 - 1)]bool +type bitSetC12 [1 << (12 - 1)]bool +type bitSetC13 [1 << (13 - 1)]bool +type bitSetC14 [1 << (14 - 1)]bool +type bitSetC15 [1 << (15 - 1)]bool +type bitSetC16 [1 << (16 - 1)]bool + +type bitSet interface { + bitSetC4 | + bitSetC5 | + bitSetC6 | + bitSetC7 | + bitSetC8 | + bitSetC9 | + bitSetC10 | + bitSetC11 | + bitSetC12 | + bitSetC13 | + bitSetC14 | + bitSetC15 | + bitSetC16 +} diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index 923946e34f..56342df6d4 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -172,31 +172,31 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstC processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13] processLastChunk := 
processChunkG1Jacobian[bucketg1JacExtendedC9] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15] + processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -418,31 +418,31 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstC processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10] + processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13] + processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15] + processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index ccd70a9474..82ec92b92f 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -32,7 +32,7 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 
0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -45,6 +45,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. + // keep things on the stack. batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -52,15 +54,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G1Affine // ... + var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -72,9 +73,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, return } BatchAddG1Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -109,8 +109,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -120,7 +119,6 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -183,9 +181,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. @@ -249,7 +245,7 @@ type ibG1Affine interface { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -262,6 +258,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. + // keep things on the stack. 
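Note on BatchAddG1Affine, which executeAndReset calls in this chunk processor: it amortizes the single field inversion of the affine-addition formula over the whole batch with Montgomery's batch-inversion trick (one inversion plus roughly 3(n-1) multiplications for n elements). A standalone sketch of that trick, using math/big over a toy prime rather than the library's fp.Element API:

package main

import (
	"fmt"
	"math/big"
)

// batchInvert replaces each a[i] by a[i]^-1 mod p using a single modular
// inversion plus ~3(n-1) multiplications (Montgomery's trick).
func batchInvert(a []*big.Int, p *big.Int) {
	n := len(a)
	if n == 0 {
		return
	}
	// forward pass: prefix[i] = a[0]*...*a[i-1] mod p
	prefix := make([]*big.Int, n)
	acc := big.NewInt(1)
	for i := 0; i < n; i++ {
		prefix[i] = new(big.Int).Set(acc)
		acc.Mul(acc, a[i]).Mod(acc, p)
	}
	// one inversion of the total product
	acc.ModInverse(acc, p)
	// backward pass: peel off one inverse per element
	for i := n - 1; i >= 0; i-- {
		inv := new(big.Int).Mul(prefix[i], acc) // = a[i]^-1, not yet reduced
		acc.Mul(acc, a[i]).Mod(acc, p)          // acc = inv(a[0]*...*a[i-1])
		a[i].Mod(inv, p)
	}
}

func main() {
	p := big.NewInt(101) // toy prime
	xs := []*big.Int{big.NewInt(3), big.NewInt(7), big.NewInt(50)}
	batchInvert(xs, p)
	fmt.Println(xs) // [34 29 99]: e.g. 3*34 = 102 = 1 mod 101
}

The forward pass stores running prefix products; after a single inversion of the total product, the backward pass recovers one inverse per element, which is why the per-addition cost quoted for the batch add stays at a handful of multiplications.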
batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -269,15 +267,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G2Affine // ... + var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -289,9 +286,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, return } BatchAddG2Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -326,8 +322,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -337,7 +332,6 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -400,9 +394,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. 
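Before the bitSet type list that follows, it is worth spelling out the scheduling invariant all these chunk processors share: a bucket may appear at most once per batch (two ops on the same bucket would alias the same R[i] inside one batch add), so conflicting ops are queued and retried after a flush. A schematic, self-contained sketch of that control flow, with bucket IDs only; flush stands in for the batch add plus executeAndReset, and the real queue-draining order differs:

package main

import "fmt"

const maxBatch = 4

func main() {
	ops := []uint32{1, 2, 1, 3, 2, 2, 4} // bucket IDs in arrival order
	var inBatch [8]bool                  // bitSet: bucket already in the current batch?
	batch := make([]uint32, 0, maxBatch)
	var queue []uint32 // ops postponed because their bucket is taken

	flush := func() { // stands in for BatchAddG2Affine + executeAndReset
		if len(batch) == 0 {
			return
		}
		fmt.Println("flush:", batch)
		var zero [8]bool
		inBatch = zero // one zero-value copy, no per-key deletes
		batch = batch[:0]
	}

	add := func(b uint32) {
		if inBatch[b] { // conflict: same bucket twice would alias one R[i]
			queue = append(queue, b)
			return
		}
		inBatch[b] = true
		batch = append(batch, b)
		if len(batch) == maxBatch {
			flush()
		}
	}

	for _, b := range ops {
		add(b)
	}
	for len(queue) > 0 { // drain: flushing frees the conflicting buckets
		pending := append([]uint32(nil), queue...)
		queue = queue[:0]
		flush()
		for _, b := range pending {
			add(b)
		}
	}
	flush()
}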
@@ -459,3 +451,33 @@ type ibG2Affine interface { bucketG2AffineC15 | bucketG2AffineC16 } + +type bitSetC4 [1 << (4 - 1)]bool +type bitSetC5 [1 << (5 - 1)]bool +type bitSetC6 [1 << (6 - 1)]bool +type bitSetC7 [1 << (7 - 1)]bool +type bitSetC8 [1 << (8 - 1)]bool +type bitSetC9 [1 << (9 - 1)]bool +type bitSetC10 [1 << (10 - 1)]bool +type bitSetC11 [1 << (11 - 1)]bool +type bitSetC12 [1 << (12 - 1)]bool +type bitSetC13 [1 << (13 - 1)]bool +type bitSetC14 [1 << (14 - 1)]bool +type bitSetC15 [1 << (15 - 1)]bool +type bitSetC16 [1 << (16 - 1)]bool + +type bitSet interface { + bitSetC4 | + bitSetC5 | + bitSetC6 | + bitSetC7 | + bitSetC8 | + bitSetC9 | + bitSetC10 | + bitSetC11 | + bitSetC12 | + bitSetC13 | + bitSetC14 | + bitSetC15 | + bitSetC16 +} diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index 410e4016ab..64b2883493 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -172,31 +172,31 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstC processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15] + processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -418,31 +418,31 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstC processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10] + processChunk := 
processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13] + processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15] + processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index 62c70e876a..b750572b22 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -32,7 +32,7 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -45,6 +45,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. + // keep things on the stack. batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -52,15 +54,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G1Affine // ... 
+ var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -72,9 +73,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, return } BatchAddG1Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -109,8 +109,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -120,7 +119,6 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -183,9 +181,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. @@ -249,7 +245,7 @@ type ibG1Affine interface { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -262,6 +258,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. + // keep things on the stack. batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -269,15 +267,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G2Affine // ... + var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -289,9 +286,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, return } BatchAddG2Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -326,8 +322,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -337,7 +332,6 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. 
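The `var queue [MAX_BATCH_SIZE]batchOp` / `qID` pair that replaces the earlier slice-with-TODO is a fixed-capacity, zero-allocation LIFO. The idiom in isolation (batchOp fields simplified; 600 is an arbitrary stand-in for MAX_BATCH_SIZE, whose real value is defined elsewhere in the package; closures are used here only for brevity):

package main

import "fmt"

type batchOp struct{ bucketID, pointID uint32 }

const maxBatchSize = 600 // arbitrary stand-in for MAX_BATCH_SIZE

func main() {
	// fixed-size array plus cursor: no heap allocation, and because the
	// length is a compile-time constant the storage can stay on the stack
	var queue [maxBatchSize]batchOp
	qID := 0

	push := func(op batchOp) { queue[qID] = op; qID++ }
	pop := func() batchOp { qID--; return queue[qID] }

	push(batchOp{bucketID: 7, pointID: 1})
	push(batchOp{bucketID: 9, pointID: 2})
	for qID != 0 {
		fmt.Println(pop()) // {9 2}, then {7 1}
	}
}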
var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -400,9 +394,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. @@ -459,3 +451,33 @@ type ibG2Affine interface { bucketG2AffineC15 | bucketG2AffineC16 } + +type bitSetC4 [1 << (4 - 1)]bool +type bitSetC5 [1 << (5 - 1)]bool +type bitSetC6 [1 << (6 - 1)]bool +type bitSetC7 [1 << (7 - 1)]bool +type bitSetC8 [1 << (8 - 1)]bool +type bitSetC9 [1 << (9 - 1)]bool +type bitSetC10 [1 << (10 - 1)]bool +type bitSetC11 [1 << (11 - 1)]bool +type bitSetC12 [1 << (12 - 1)]bool +type bitSetC13 [1 << (13 - 1)]bool +type bitSetC14 [1 << (14 - 1)]bool +type bitSetC15 [1 << (15 - 1)]bool +type bitSetC16 [1 << (16 - 1)]bool + +type bitSet interface { + bitSetC4 | + bitSetC5 | + bitSetC6 | + bitSetC7 | + bitSetC8 | + bitSetC9 | + bitSetC10 | + bitSetC11 | + bitSetC12 | + bitSetC13 | + bitSetC14 | + bitSetC15 | + bitSetC16 +} diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index b8fdb1314f..762cecb6c9 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -159,7 +159,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstC processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -368,7 +368,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstC processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index 1a7d1b4abe..12762830f9 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -32,7 +32,7 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -45,6 +45,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. + // keep things on the stack. 
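The reason the bitSet is threaded through as a type parameter BS rather than passed as a slice or map: each instantiation has a compile-time array length, so `var bucketIds BS` needs no heap allocation, and resetting it is a single zero-value copy (`var tmp BS; bucketIds = tmp`) instead of a per-key map delete loop. A minimal standalone illustration of the pattern, with type names shortened:

package main

import "fmt"

type setC4 [1 << (4 - 1)]bool
type setC8 [1 << (8 - 1)]bool

type set interface{ setC4 | setC8 }

// countPerBatch counts first occurrences of each id within a batch. S has a
// constant size per instantiation, so `var s S` is a plain local array, and
// the reset between batches has the same shape as the bucketIds reset above.
func countPerBatch[S set](batches [][]uint32) []int {
	var s S
	out := make([]int, 0, len(batches))
	for _, batch := range batches {
		fresh := 0
		for _, id := range batch {
			if !s[id] {
				s[id] = true
				fresh++
			}
		}
		out = append(out, fresh)
		var zero S
		s = zero // reset: one zero-value copy
	}
	return out
}

func main() {
	fmt.Println(countPerBatch[setC4]([][]uint32{{1, 2, 2}, {2, 5}})) // [2 2]
}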
batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -52,15 +54,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G1Affine // ... + var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -72,9 +73,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, return } BatchAddG1Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -109,8 +109,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -120,7 +119,6 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -183,9 +181,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. @@ -231,7 +227,7 @@ type ibG1Affine interface { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -244,6 +240,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. + // keep things on the stack. batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -251,15 +249,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G2Affine // ... 
+ var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -271,9 +268,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, return } BatchAddG2Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -308,8 +304,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -319,7 +314,6 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -382,9 +376,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. @@ -423,3 +415,15 @@ type ibG2Affine interface { bucketG2AffineC8 | bucketG2AffineC16 } + +type bitSetC4 [1 << (4 - 1)]bool +type bitSetC5 [1 << (5 - 1)]bool +type bitSetC8 [1 << (8 - 1)]bool +type bitSetC16 [1 << (16 - 1)]bool + +type bitSet interface { + bitSetC4 | + bitSetC5 | + bitSetC8 | + bitSetC16 +} diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index 98b3867477..649b86facf 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -160,7 +160,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstC processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -370,7 +370,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstC processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index 93b394e246..e4748e2c8b 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -32,7 +32,7 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -45,6 +45,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. 
+ // keep things on the stack. batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -52,15 +54,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G1Affine // ... + var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -72,9 +73,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, return } BatchAddG1Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -109,8 +109,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -120,7 +119,6 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -183,9 +181,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. @@ -231,7 +227,7 @@ type ibG1Affine interface { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -244,6 +240,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. + // keep things on the stack. batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -251,15 +249,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G2Affine // ... 
+ var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -271,9 +268,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, return } BatchAddG2Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -308,8 +304,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -319,7 +314,6 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -382,9 +376,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. @@ -423,3 +415,15 @@ type ibG2Affine interface { bucketG2AffineC8 | bucketG2AffineC16 } + +type bitSetC4 [1 << (4 - 1)]bool +type bitSetC5 [1 << (5 - 1)]bool +type bitSetC8 [1 << (8 - 1)]bool +type bitSetC16 [1 << (16 - 1)]bool + +type bitSet interface { + bitSetC4 | + bitSetC5 | + bitSetC8 | + bitSetC16 +} diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index 9a41a9176f..9eb52c0130 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -160,7 +160,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstC processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -370,7 +370,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstC processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index cdd2c92daf..167bdf2902 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -32,7 +32,7 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -45,6 +45,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. 
+ // keep things on the stack. batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -52,15 +54,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G1Affine // ... + var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -72,9 +73,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, return } BatchAddG1Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -109,8 +109,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -120,7 +119,6 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -183,9 +181,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. @@ -231,7 +227,7 @@ type ibG1Affine interface { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -244,6 +240,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. + // keep things on the stack. batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -251,15 +249,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G2Affine // ... 
+ var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -271,9 +268,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, return } BatchAddG2Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -308,8 +304,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -319,7 +314,6 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -382,9 +376,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. @@ -423,3 +415,15 @@ type ibG2Affine interface { bucketG2AffineC8 | bucketG2AffineC16 } + +type bitSetC4 [1 << (4 - 1)]bool +type bitSetC5 [1 << (5 - 1)]bool +type bitSetC8 [1 << (8 - 1)]bool +type bitSetC16 [1 << (16 - 1)]bool + +type bitSet interface { + bitSetC4 | + bitSetC5 | + bitSetC8 | + bitSetC16 +} diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index 6dca2eb861..9e5ce00b70 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -427,7 +427,7 @@ func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffi {{- if le $c 9}} processChunk := processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}] {{- else}} - processChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{$c}}] + processChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{$c}}, bitSetC{{$c}}] {{- end}} {{- if eq $c $lc}} _innerMsm{{ $.UPointName }}(p, {{$c}}, points, digits, splitFirstChunk, processChunk, processChunk) @@ -435,7 +435,7 @@ func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffi {{- if le $lc 9}} processLastChunk := processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{lastC $c}}] {{- else}} - processLastChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{lastC $c}}] + processLastChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{lastC $c}}, bitSetC{{$c}}] {{- end}} _innerMsm{{ $.UPointName }}(p, {{$c}}, points, digits, splitFirstChunk, processChunk, processLastChunk) {{- end}} diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index 897401430f..16071451e5 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -33,7 +33,7 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine 
}}](chunk uint64, +func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](chunk uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, @@ -46,6 +46,8 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64 } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. + // keep things on the stack. batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -53,15 +55,14 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64 if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) + var bucketIds BS // bitSet to signify presence of a bucket in current batch cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]{{ $.TAffine }} // allocated on the stack - var R [MAX_BATCH_SIZE]*{{ $.TAffine }} // ... + var P [MAX_BATCH_SIZE]{{ $.TAffine }} // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*{{ $.TAffine }} // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -73,9 +74,8 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64 return } BatchAdd{{ $.TAffine }}(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -110,8 +110,7 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64 } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -122,7 +121,6 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64 } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -185,9 +183,7 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64 // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. 
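The `1<<({{$c}}-1)` array length generated below mirrors the bucket count of the signed-digit windowed method: digits are recoded into a symmetric range around zero, and a negative digit reuses the positive bucket with the point negated, halving the bucket (and thus bitset) count compared to plain unsigned windows. For instance:

package main

import "fmt"

func main() {
	// plain 0..2^c-1 digits would need 2^c - 1 buckets (digit 0 needs none);
	// signed recoding halves that, since digit -d reuses bucket d with -P
	for _, c := range []uint{4, 8, 16} {
		fmt.Printf("c=%2d  unsigned buckets=%5d  signed buckets=%5d\n",
			c, (1<<c)-1, 1<<(c-1))
	}
}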
@@ -226,3 +222,13 @@ type ib{{ $.TAffine }} interface {
 }
 
 {{end }}
+
+{{- range $c := $.G1.CRange}}
+type bitSetC{{$c}} [1<<({{$c}}-1)]bool
+{{- end}}
+
+type bitSet interface {
+	{{- range $i, $c := $.G1.CRange}}
+	bitSetC{{$c}} {{- if not (last $i $.G1.CRange)}} | {{- end}}
+	{{- end}}
+}

From b75ae0976bdc864c5d459d2b10aa43f3fbd7363d Mon Sep 17 00:00:00 2001
From: Gautam Botrel
Date: Thu, 10 Nov 2022 15:52:33 -0600
Subject: [PATCH 13/43] feat: restored split msm logic

---
 ecc/bls12-377/multiexp.go                   | 128 +++++++++---------
 ecc/bls12-377/multiexp_test.go              | 108 ++-------------
 ecc/bls12-378/multiexp.go                   | 128 +++++++++---------
 ecc/bls12-378/multiexp_test.go              | 108 ++-------------
 ecc/bls12-381/multiexp.go                   | 128 +++++++++---------
 ecc/bls12-381/multiexp_test.go              | 108 ++-------------
 ecc/bls24-315/multiexp.go                   | 128 +++++++++---------
 ecc/bls24-315/multiexp_test.go              | 108 ++-------------
 ecc/bls24-317/multiexp.go                   | 128 +++++++++---------
 ecc/bls24-317/multiexp_test.go              | 108 ++-------------
 ecc/bn254/multiexp.go                       | 128 +++++++++---------
 ecc/bn254/multiexp_test.go                  | 108 ++-------------
 ecc/bw6-633/multiexp.go                     | 128 +++++++++---------
 ecc/bw6-633/multiexp_test.go                | 108 ++-------------
 ecc/bw6-756/multiexp.go                     | 128 +++++++++---------
 ecc/bw6-756/multiexp_test.go                | 108 ++-------------
 ecc/bw6-761/multiexp.go                     | 128 +++++++++---------
 ecc/bw6-761/multiexp_test.go                | 108 ++-------------
 .../generator/ecc/template/multiexp.go.tmpl |  72 +++++-----
 .../ecc/template/tests/multiexp.go.tmpl     |  57 +-------
 20 files changed, 727 insertions(+), 1526 deletions(-)

diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go
index 24e4fe5ee0..3c7e9fa3e4 100644
--- a/ecc/bls12-377/multiexp.go
+++ b/ecc/bls12-377/multiexp.go
@@ -99,54 +99,56 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul
 			C = c
 		}
 	}
-	// empirical, needs to be tuned.
-	// if C > 16 && nbPoints < 1 << 23 {
-	// 	C = 16
-	// }
 	return C
 }
 
-	// TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars.
-	// nbSplits := 1
 	C := bestC(nbPoints)
 	nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
 	if (fr.Limbs*64)%C != 0 {
 		nbChunks++
 	}
+	// if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split
+	if config.NbTasks > 1 && nbChunks < config.NbTasks {
+		// before splitting, let's see if we end up with more tasks than threads;
+		cSplit := bestC(nbPoints / 2)
+		nbChunksPostSplit := int(fr.Limbs * 64 / cSplit)
+		if (fr.Limbs*64)%cSplit != 0 {
+			nbChunksPostSplit++
+		}
+		nbTasksPostSplit := nbChunksPostSplit * 2
+		if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) {
+			// if post-split we still have fewer tasks than available CPUs,
+			// or if we have more tasks BUT the difference in CPU usage is in our favor, we split.
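A worked instance of the decision above, with fr.Limbs = 4 (256-bit scalars), 32 available tasks, and hypothetical bestC results of 16 and 13 for the full and half-size inputs (the real bestC consults per-curve lookup tables):

package main

import "fmt"

const frLimbs = 4 // fr.Limbs for a 256-bit scalar field

func nbChunks(c int) int {
	n := frLimbs * 64 / c
	if (frLimbs*64)%c != 0 {
		n++
	}
	return n
}

func main() {
	nbTasks := 32       // config.NbTasks, e.g. runtime.NumCPU()
	c, cSplit := 16, 13 // hypothetical bestC results for n and n/2 points

	chunks := nbChunks(c)                  // tasks if we don't split
	tasksPostSplit := 2 * nbChunks(cSplit) // two half-size MSMs in parallel

	split := tasksPostSplit <= nbTasks ||
		tasksPostSplit-nbTasks <= nbTasks-chunks
	fmt.Println(chunks, tasksPostSplit, split) // 16 40 true: 8 excess tasks beats 16 idle workers
}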
+			config.NbTasks /= 2
+			var _p G1Jac
+			chDone := make(chan struct{}, 1)
+			go func() {
+				innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config)
+				close(chDone)
+			}()
+			innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config)
+			<-chDone
+			p.AddAssign(&_p)
+			return p, nil
+		}
+	}
+
+	innerMsmG1(p, int(C), points, scalars, config)
+
+	return p, nil
+}
+
+func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) {
 	// partition the scalars
 	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
 	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
 	// var smallValues int
-	digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+	digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks)
 
 	// if we have more than 10% of small values, we split the processing of the first chunk in 2
 	// we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time
 	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
-	innerMsmG1(p, int(C), points, digits, splitFirstChunk)
-	// we have nbSplits intermediate results that we must sum together.
-
-	// _p := make([]G1Jac, nbSplits - 1)
-	// chDone := make(chan int, nbSplits - 1)
-	// for i:=0; i < nbSplits-1; i++ {
-	// 	start := i * nbPoints
-	// 	end := start + nbPoints
-	// 	go func(start, end, i int) {
-	// 		innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk)
-	// 		chDone <- i
-	// 	}(start, end, i)
-	// }
-
-	// innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk)
-	// for i:=0; i < nbSplits-1; i++ {
-	// 	done := <-chDone
-	// 	p.AddAssign(&_p[done])
-	// }
-	// close(chDone)
-	return p, nil
-}
-
-func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) {
 
 	switch c {
 
 	case 4:
@@ -345,54 +347,56 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul
 			C = c
 		}
 	}
-	// empirical, needs to be tuned.
-	// if C > 16 && nbPoints < 1 << 23 {
-	// 	C = 16
-	// }
 	return C
 }
 
-	// TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars.
-	// nbSplits := 1
 	C := bestC(nbPoints)
 	nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
 	if (fr.Limbs*64)%C != 0 {
 		nbChunks++
 	}
+	// if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split
+	if config.NbTasks > 1 && nbChunks < config.NbTasks {
+		// before splitting, let's see if we end up with more tasks than threads;
+		cSplit := bestC(nbPoints / 2)
+		nbChunksPostSplit := int(fr.Limbs * 64 / cSplit)
+		if (fr.Limbs*64)%cSplit != 0 {
+			nbChunksPostSplit++
+		}
+		nbTasksPostSplit := nbChunksPostSplit * 2
+		if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) {
+			// if post-split we still have fewer tasks than available CPUs,
+			// or if we have more tasks BUT the difference in CPU usage is in our favor, we split.
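The carry propagation mentioned in the innerMsmG1 comment above ("if it's larger than 2^{c-1} ... we have a carry we need to propagate") is easiest to see on a single machine word. A toy recoding of one uint64 scalar into signed base-2^c digits; the real partitionScalars works limb-by-limb on fr.Element and packs sign and magnitude into uint32 digits:

package main

import (
	"fmt"
	"math/bits"
)

// recode splits a 64-bit scalar into signed base-2^c digits, propagating a
// carry whenever a window digit exceeds 2^(c-1).
func recode(scalar uint64, c uint) []int64 {
	nbChunks := (uint(bits.Len64(scalar)) + c - 1) / c
	digits := make([]int64, 0, nbChunks+1)
	carry := int64(0)
	for i := uint(0); i < nbChunks; i++ {
		d := int64((scalar>>(i*c))&(1<<c-1)) + carry
		carry = 0
		if d > 1<<(c-1) { // too big for a bucket: borrow from the next window
			d -= 1 << c
			carry = 1
		}
		digits = append(digits, d)
	}
	if carry != 0 {
		digits = append(digits, carry)
	}
	return digits
}

func main() {
	const c = 4
	s := uint64(0xDE) // 222
	digits := recode(s, c)
	fmt.Println(digits) // [-2 -2 1]: 222 = -2 - 2*16 + 1*256
	// sanity check: the signed digits re-sum to the scalar
	var acc int64
	for i, d := range digits {
		acc += d * (int64(1) << (c * uint(i)))
	}
	fmt.Println(acc == int64(s)) // true
}

Every digit magnitude ends up at most 2^(c-1), which is exactly why the bitSetC types and the affine buckets are sized 1<<(c-1).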
+ config.NbTasks /= 2 + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + } + + innerMsmG2(p, int(C), points, scalars, config) + + return p, nil +} + +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, digits, splitFirstChunk) - // we have nbSplits intermediate results that we must sum together. - - // _p := make([]G2Jac, nbSplits - 1) - // chDone := make(chan int, nbSplits - 1) - // for i:=0; i < nbSplits-1; i++ { - // start := i * nbPoints - // end := start + nbPoints - // go func(start, end, i int) { - // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - // chDone <- i - // }(start, end, i) - // } - - // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - // for i:=0; i < nbSplits-1; i++ { - // done := <-chDone - // p.AddAssign(&_p[done]) - // } - // close(chDone) - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index e9e7fd9c67..3da962b555 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -94,8 +94,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -130,14 +129,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -171,14 +165,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range 
cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -190,39 +179,6 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) - properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( - func(mixer fr.Element) bool { - // multi exp points - var samplePoints [nbSamples]G1Affine - var g G1Jac - g.Set(&g1Gen) - for i := 1; i <= nbSamples; i++ { - samplePoints[i-1].FromJacobian(&g) - g.AddAssign(&g1Gen) - } - // mixer ensures that all the words of a fpElement are set - var sampleScalars [nbSamples]fr.Element - - for i := 1; i <= nbSamples; i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() - } - - var result1, result2 G1Jac - for _, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) - innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) - if !result1.Equal(&result2) { - return false - } - } - return true - }, - genScalar, - )) - // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -420,8 +376,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -454,14 +409,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -495,14 +445,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ 
-514,39 +459,6 @@ func TestMultiExpG2(t *testing.T) { genScalar, )) - properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( - func(mixer fr.Element) bool { - // multi exp points - var samplePoints [nbSamples]G2Affine - var g G2Jac - g.Set(&g2Gen) - for i := 1; i <= nbSamples; i++ { - samplePoints[i-1].FromJacobian(&g) - g.AddAssign(&g2Gen) - } - // mixer ensures that all the words of a fpElement are set - var sampleScalars [nbSamples]fr.Element - - for i := 1; i <= nbSamples; i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() - } - - var result1, result2 G2Jac - for _, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&result1, int(c), samplePoints[:], scalars, false) - innerMsmG2(&result2, int(c), samplePoints[:], scalars, false) - if !result1.Equal(&result2) { - return false - } - } - return true - }, - genScalar, - )) - // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 3f79b12596..2371d32a5c 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -99,54 +99,56 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul C = c } } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } return C } - // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. - // nbSplits := 1 C := bestC(nbPoints) nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar if (fr.Limbs*64)%C != 0 { nbChunks++ } + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split + if config.NbTasks > 1 && nbChunks < config.NbTasks { + // before spliting, let's see if we endup with more tasks than thread; + cSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) + if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit++ + } + nbTasksPostSplit := nbChunksPostSplit * 2 + if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + // if postSplit we still have less tasks than available CPU + // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. 
+ config.NbTasks /= 2 + var _p G1Jac + chDone := make(chan struct{}, 1) + go func() { + innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + } + + innerMsmG1(p, int(C), points, scalars, config) + + return p, nil +} + +func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) { // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG1(p, int(C), points, digits, splitFirstChunk) - // we have nbSplits intermediate results that we must sum together. - - // _p := make([]G1Jac, nbSplits - 1) - // chDone := make(chan int, nbSplits - 1) - // for i:=0; i < nbSplits-1; i++ { - // start := i * nbPoints - // end := start + nbPoints - // go func(start, end, i int) { - // innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - // chDone <- i - // }(start, end, i) - // } - - // innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - // for i:=0; i < nbSplits-1; i++ { - // done := <-chDone - // p.AddAssign(&_p[done]) - // } - // close(chDone) - return p, nil -} - -func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: @@ -345,54 +347,56 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul C = c } } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } return C } - // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. - // nbSplits := 1 C := bestC(nbPoints) nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar if (fr.Limbs*64)%C != 0 { nbChunks++ } + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split + if config.NbTasks > 1 && nbChunks < config.NbTasks { + // before spliting, let's see if we endup with more tasks than thread; + cSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) + if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit++ + } + nbTasksPostSplit := nbChunksPostSplit * 2 + if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + // if postSplit we still have less tasks than available CPU + // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. 
+ config.NbTasks /= 2 + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + } + + innerMsmG2(p, int(C), points, scalars, config) + + return p, nil +} + +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, digits, splitFirstChunk) - // we have nbSplits intermediate results that we must sum together. - - // _p := make([]G2Jac, nbSplits - 1) - // chDone := make(chan int, nbSplits - 1) - // for i:=0; i < nbSplits-1; i++ { - // start := i * nbPoints - // end := start + nbPoints - // go func(start, end, i int) { - // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - // chDone <- i - // }(start, end, i) - // } - - // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - // for i:=0; i < nbSplits-1; i++ { - // done := <-chDone - // p.AddAssign(&_p[done]) - // } - // close(chDone) - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index 65729bd9c1..1cf20b793f 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -94,8 +94,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -130,14 +129,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -171,14 +165,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range 
cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -190,39 +179,6 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) - properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( - func(mixer fr.Element) bool { - // multi exp points - var samplePoints [nbSamples]G1Affine - var g G1Jac - g.Set(&g1Gen) - for i := 1; i <= nbSamples; i++ { - samplePoints[i-1].FromJacobian(&g) - g.AddAssign(&g1Gen) - } - // mixer ensures that all the words of a fpElement are set - var sampleScalars [nbSamples]fr.Element - - for i := 1; i <= nbSamples; i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() - } - - var result1, result2 G1Jac - for _, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) - innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) - if !result1.Equal(&result2) { - return false - } - } - return true - }, - genScalar, - )) - // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -420,8 +376,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -454,14 +409,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -495,14 +445,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ 
-514,39 +459,6 @@ func TestMultiExpG2(t *testing.T) { genScalar, )) - properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( - func(mixer fr.Element) bool { - // multi exp points - var samplePoints [nbSamples]G2Affine - var g G2Jac - g.Set(&g2Gen) - for i := 1; i <= nbSamples; i++ { - samplePoints[i-1].FromJacobian(&g) - g.AddAssign(&g2Gen) - } - // mixer ensures that all the words of a fpElement are set - var sampleScalars [nbSamples]fr.Element - - for i := 1; i <= nbSamples; i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() - } - - var result1, result2 G2Jac - for _, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&result1, int(c), samplePoints[:], scalars, false) - innerMsmG2(&result2, int(c), samplePoints[:], scalars, false) - if !result1.Equal(&result2) { - return false - } - } - return true - }, - genScalar, - )) - // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index 5bd22872c9..e13ca90588 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -99,54 +99,56 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul C = c } } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } return C } - // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. - // nbSplits := 1 C := bestC(nbPoints) nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar if (fr.Limbs*64)%C != 0 { nbChunks++ } + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split + if config.NbTasks > 1 && nbChunks < config.NbTasks { + // before spliting, let's see if we endup with more tasks than thread; + cSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) + if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit++ + } + nbTasksPostSplit := nbChunksPostSplit * 2 + if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + // if postSplit we still have less tasks than available CPU + // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. 
+ config.NbTasks /= 2 + var _p G1Jac + chDone := make(chan struct{}, 1) + go func() { + innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + } + + innerMsmG1(p, int(C), points, scalars, config) + + return p, nil +} + +func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) { // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG1(p, int(C), points, digits, splitFirstChunk) - // we have nbSplits intermediate results that we must sum together. - - // _p := make([]G1Jac, nbSplits - 1) - // chDone := make(chan int, nbSplits - 1) - // for i:=0; i < nbSplits-1; i++ { - // start := i * nbPoints - // end := start + nbPoints - // go func(start, end, i int) { - // innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - // chDone <- i - // }(start, end, i) - // } - - // innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - // for i:=0; i < nbSplits-1; i++ { - // done := <-chDone - // p.AddAssign(&_p[done]) - // } - // close(chDone) - return p, nil -} - -func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: @@ -345,54 +347,56 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul C = c } } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } return C } - // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. - // nbSplits := 1 C := bestC(nbPoints) nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar if (fr.Limbs*64)%C != 0 { nbChunks++ } + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split + if config.NbTasks > 1 && nbChunks < config.NbTasks { + // before spliting, let's see if we endup with more tasks than thread; + cSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) + if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit++ + } + nbTasksPostSplit := nbChunksPostSplit * 2 + if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + // if postSplit we still have less tasks than available CPU + // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. 
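The code that follows this comment block runs the two half-MSMs concurrently and folds the partial results with AddAssign. Below is a sketch of that fork-join shape on plain integers; sumRange and the slice halving stand in for innerMsmG1/innerMsmG2 over points[:n/2] and points[n/2:].

package main

import "fmt"

// sumRange stands in for one half of the MSM work.
func sumRange(xs []int) int {
	s := 0
	for _, x := range xs {
		s += x
	}
	return s
}

func main() {
	xs := []int{1, 2, 3, 4, 5, 6}
	half := len(xs) / 2

	var lo int
	chDone := make(chan struct{}, 1)
	go func() {
		lo = sumRange(xs[:half]) // first half on a separate goroutine
		close(chDone)
	}()
	hi := sumRange(xs[half:]) // second half on the current goroutine
	<-chDone                  // join, then fold the two partial results
	fmt.Println(lo + hi)      // 21; the MSM folds with p.AddAssign(&_p)
}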
+ config.NbTasks /= 2 + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + } + + innerMsmG2(p, int(C), points, scalars, config) + + return p, nil +} + +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, digits, splitFirstChunk) - // we have nbSplits intermediate results that we must sum together. - - // _p := make([]G2Jac, nbSplits - 1) - // chDone := make(chan int, nbSplits - 1) - // for i:=0; i < nbSplits-1; i++ { - // start := i * nbPoints - // end := start + nbPoints - // go func(start, end, i int) { - // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - // chDone <- i - // }(start, end, i) - // } - - // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - // for i:=0; i < nbSplits-1; i++ { - // done := <-chDone - // p.AddAssign(&_p[done]) - // } - // close(chDone) - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index 4b357be4d9..b44ec363a2 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -94,8 +94,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -130,14 +129,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -171,14 +165,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range 
cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -190,39 +179,6 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) - properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( - func(mixer fr.Element) bool { - // multi exp points - var samplePoints [nbSamples]G1Affine - var g G1Jac - g.Set(&g1Gen) - for i := 1; i <= nbSamples; i++ { - samplePoints[i-1].FromJacobian(&g) - g.AddAssign(&g1Gen) - } - // mixer ensures that all the words of a fpElement are set - var sampleScalars [nbSamples]fr.Element - - for i := 1; i <= nbSamples; i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() - } - - var result1, result2 G1Jac - for _, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) - innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) - if !result1.Equal(&result2) { - return false - } - } - return true - }, - genScalar, - )) - // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -420,8 +376,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -454,14 +409,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -495,14 +445,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ 
-514,39 +459,6 @@ func TestMultiExpG2(t *testing.T) { genScalar, )) - properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( - func(mixer fr.Element) bool { - // multi exp points - var samplePoints [nbSamples]G2Affine - var g G2Jac - g.Set(&g2Gen) - for i := 1; i <= nbSamples; i++ { - samplePoints[i-1].FromJacobian(&g) - g.AddAssign(&g2Gen) - } - // mixer ensures that all the words of a fpElement are set - var sampleScalars [nbSamples]fr.Element - - for i := 1; i <= nbSamples; i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() - } - - var result1, result2 G2Jac - for _, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&result1, int(c), samplePoints[:], scalars, false) - innerMsmG2(&result2, int(c), samplePoints[:], scalars, false) - if !result1.Equal(&result2) { - return false - } - } - return true - }, - genScalar, - )) - // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 81dc28638c..d73bb2783e 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -99,54 +99,56 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul C = c } } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } return C } - // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. - // nbSplits := 1 C := bestC(nbPoints) nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar if (fr.Limbs*64)%C != 0 { nbChunks++ } + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split + if config.NbTasks > 1 && nbChunks < config.NbTasks { + // before spliting, let's see if we endup with more tasks than thread; + cSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) + if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit++ + } + nbTasksPostSplit := nbChunksPostSplit * 2 + if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + // if postSplit we still have less tasks than available CPU + // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. 
+ config.NbTasks /= 2 + var _p G1Jac + chDone := make(chan struct{}, 1) + go func() { + innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + } + + innerMsmG1(p, int(C), points, scalars, config) + + return p, nil +} + +func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) { // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG1(p, int(C), points, digits, splitFirstChunk) - // we have nbSplits intermediate results that we must sum together. - - // _p := make([]G1Jac, nbSplits - 1) - // chDone := make(chan int, nbSplits - 1) - // for i:=0; i < nbSplits-1; i++ { - // start := i * nbPoints - // end := start + nbPoints - // go func(start, end, i int) { - // innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - // chDone <- i - // }(start, end, i) - // } - - // innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - // for i:=0; i < nbSplits-1; i++ { - // done := <-chDone - // p.AddAssign(&_p[done]) - // } - // close(chDone) - return p, nil -} - -func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: @@ -345,54 +347,56 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul C = c } } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } return C } - // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. - // nbSplits := 1 C := bestC(nbPoints) nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar if (fr.Limbs*64)%C != 0 { nbChunks++ } + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split + if config.NbTasks > 1 && nbChunks < config.NbTasks { + // before spliting, let's see if we endup with more tasks than thread; + cSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) + if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit++ + } + nbTasksPostSplit := nbChunksPostSplit * 2 + if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + // if postSplit we still have less tasks than available CPU + // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. 
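nbChunks above counts the c-bit windows ("radixes") of a scalar, i.e. ceil(fr.Limbs*64/c). A toy single-limb version of that windowing, least-significant window first; digitsBase2c is an illustrative name and the real code works on fr.Limbs 64-bit limbs, not one uint64.

package main

import "fmt"

// digitsBase2c splits a single-limb scalar into its c-bit windows,
// least-significant window first.
func digitsBase2c(scalar uint64, c uint) []uint64 {
	mask := uint64(1)<<c - 1
	n := (64 + c - 1) / c // ceil(64/c), as in the nbChunks computation
	out := make([]uint64, 0, n)
	for i := uint(0); i < n; i++ {
		out = append(out, (scalar>>(i*c))&mask)
	}
	return out
}

func main() {
	fmt.Println(digitsBase2c(0xABCD, 4)[:4]) // [13 12 11 10]
}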
+ config.NbTasks /= 2 + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + } + + innerMsmG2(p, int(C), points, scalars, config) + + return p, nil +} + +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, digits, splitFirstChunk) - // we have nbSplits intermediate results that we must sum together. - - // _p := make([]G2Jac, nbSplits - 1) - // chDone := make(chan int, nbSplits - 1) - // for i:=0; i < nbSplits-1; i++ { - // start := i * nbPoints - // end := start + nbPoints - // go func(start, end, i int) { - // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - // chDone <- i - // }(start, end, i) - // } - - // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - // for i:=0; i < nbSplits-1; i++ { - // done := <-chDone - // p.AddAssign(&_p[done]) - // } - // close(chDone) - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index 0773215145..8da7433fa9 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -94,8 +94,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -130,14 +129,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -171,14 +165,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range 
cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -190,39 +179,6 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) - properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( - func(mixer fr.Element) bool { - // multi exp points - var samplePoints [nbSamples]G1Affine - var g G1Jac - g.Set(&g1Gen) - for i := 1; i <= nbSamples; i++ { - samplePoints[i-1].FromJacobian(&g) - g.AddAssign(&g1Gen) - } - // mixer ensures that all the words of a fpElement are set - var sampleScalars [nbSamples]fr.Element - - for i := 1; i <= nbSamples; i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() - } - - var result1, result2 G1Jac - for _, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) - innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) - if !result1.Equal(&result2) { - return false - } - } - return true - }, - genScalar, - )) - // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -420,8 +376,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -454,14 +409,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -495,14 +445,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ 
-514,39 +459,6 @@ func TestMultiExpG2(t *testing.T) { genScalar, )) - properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( - func(mixer fr.Element) bool { - // multi exp points - var samplePoints [nbSamples]G2Affine - var g G2Jac - g.Set(&g2Gen) - for i := 1; i <= nbSamples; i++ { - samplePoints[i-1].FromJacobian(&g) - g.AddAssign(&g2Gen) - } - // mixer ensures that all the words of a fpElement are set - var sampleScalars [nbSamples]fr.Element - - for i := 1; i <= nbSamples; i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() - } - - var result1, result2 G2Jac - for _, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&result1, int(c), samplePoints[:], scalars, false) - innerMsmG2(&result2, int(c), samplePoints[:], scalars, false) - if !result1.Equal(&result2) { - return false - } - } - return true - }, - genScalar, - )) - // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index 56342df6d4..ceee16e7fd 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -99,54 +99,56 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul C = c } } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } return C } - // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. - // nbSplits := 1 C := bestC(nbPoints) nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar if (fr.Limbs*64)%C != 0 { nbChunks++ } + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split + if config.NbTasks > 1 && nbChunks < config.NbTasks { + // before spliting, let's see if we endup with more tasks than thread; + cSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) + if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit++ + } + nbTasksPostSplit := nbChunksPostSplit * 2 + if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + // if postSplit we still have less tasks than available CPU + // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. 
+ config.NbTasks /= 2 + var _p G1Jac + chDone := make(chan struct{}, 1) + go func() { + innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + } + + innerMsmG1(p, int(C), points, scalars, config) + + return p, nil +} + +func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) { // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG1(p, int(C), points, digits, splitFirstChunk) - // we have nbSplits intermediate results that we must sum together. - - // _p := make([]G1Jac, nbSplits - 1) - // chDone := make(chan int, nbSplits - 1) - // for i:=0; i < nbSplits-1; i++ { - // start := i * nbPoints - // end := start + nbPoints - // go func(start, end, i int) { - // innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - // chDone <- i - // }(start, end, i) - // } - - // innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - // for i:=0; i < nbSplits-1; i++ { - // done := <-chDone - // p.AddAssign(&_p[done]) - // } - // close(chDone) - return p, nil -} - -func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: @@ -345,54 +347,56 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul C = c } } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } return C } - // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. - // nbSplits := 1 C := bestC(nbPoints) nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar if (fr.Limbs*64)%C != 0 { nbChunks++ } + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split + if config.NbTasks > 1 && nbChunks < config.NbTasks { + // before spliting, let's see if we endup with more tasks than thread; + cSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) + if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit++ + } + nbTasksPostSplit := nbChunksPostSplit * 2 + if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + // if postSplit we still have less tasks than available CPU + // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. 
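The innerMsm comment later in this hunk notes that partitionScalars walks the c-bit windows from the LSW and propagates a carry whenever a window exceeds 2^{c-1}. Here is a single-limb sketch of that signed-digit recoding, under the assumption that digits are kept in [-2^{c-1}, 2^{c-1}]; the name and scope are illustrative, not the actual partitionScalars implementation.

package main

import "fmt"

// signedDigits recodes c-bit windows into [-2^(c-1), 2^(c-1)]: whenever a
// window exceeds 2^(c-1) we emit d-2^c instead and carry 1 into the next
// window, which is the carry propagation the comment refers to.
func signedDigits(scalar uint64, c uint) []int64 {
	mask := uint64(1)<<c - 1
	half := int64(1) << (c - 1)
	n := (64+c-1)/c + 1 // one extra window to absorb a final carry
	out := make([]int64, 0, n)
	carry := int64(0)
	for i := uint(0); i < n; i++ {
		d := carry
		if i*c < 64 {
			d += int64((scalar >> (i * c)) & mask)
		}
		carry = 0
		if d > half { // too large: emit d-2^c and propagate a carry up
			d -= int64(1) << c
			carry = 1
		}
		out = append(out, d)
	}
	return out
}

func main() {
	// with c=4, the window 0xF = 15 > 8 becomes -1 plus a carry of 1,
	// and indeed -1 + 1*16 = 15.
	fmt.Println(signedDigits(0xF, 4)[:3]) // [-1 1 0]
}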
+ config.NbTasks /= 2 + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + } + + innerMsmG2(p, int(C), points, scalars, config) + + return p, nil +} + +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, digits, splitFirstChunk) - // we have nbSplits intermediate results that we must sum together. - - // _p := make([]G2Jac, nbSplits - 1) - // chDone := make(chan int, nbSplits - 1) - // for i:=0; i < nbSplits-1; i++ { - // start := i * nbPoints - // end := start + nbPoints - // go func(start, end, i int) { - // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - // chDone <- i - // }(start, end, i) - // } - - // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - // for i:=0; i < nbSplits-1; i++ { - // done := <-chDone - // p.AddAssign(&_p[done]) - // } - // close(chDone) - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index 9ebe0c4217..3d99991dc1 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -94,8 +94,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -130,14 +129,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -171,14 +165,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range 
cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -190,39 +179,6 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) - properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( - func(mixer fr.Element) bool { - // multi exp points - var samplePoints [nbSamples]G1Affine - var g G1Jac - g.Set(&g1Gen) - for i := 1; i <= nbSamples; i++ { - samplePoints[i-1].FromJacobian(&g) - g.AddAssign(&g1Gen) - } - // mixer ensures that all the words of a fpElement are set - var sampleScalars [nbSamples]fr.Element - - for i := 1; i <= nbSamples; i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() - } - - var result1, result2 G1Jac - for _, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) - innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) - if !result1.Equal(&result2) { - return false - } - } - return true - }, - genScalar, - )) - // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -420,8 +376,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -454,14 +409,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -495,14 +445,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ 
-514,39 +459,6 @@ func TestMultiExpG2(t *testing.T) { genScalar, )) - properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( - func(mixer fr.Element) bool { - // multi exp points - var samplePoints [nbSamples]G2Affine - var g G2Jac - g.Set(&g2Gen) - for i := 1; i <= nbSamples; i++ { - samplePoints[i-1].FromJacobian(&g) - g.AddAssign(&g2Gen) - } - // mixer ensures that all the words of a fpElement are set - var sampleScalars [nbSamples]fr.Element - - for i := 1; i <= nbSamples; i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() - } - - var result1, result2 G2Jac - for _, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&result1, int(c), samplePoints[:], scalars, false) - innerMsmG2(&result2, int(c), samplePoints[:], scalars, false) - if !result1.Equal(&result2) { - return false - } - } - return true - }, - genScalar, - )) - // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index 64b2883493..1167251103 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -99,54 +99,56 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul C = c } } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } return C } - // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. - // nbSplits := 1 C := bestC(nbPoints) nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar if (fr.Limbs*64)%C != 0 { nbChunks++ } + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split + if config.NbTasks > 1 && nbChunks < config.NbTasks { + // before spliting, let's see if we endup with more tasks than thread; + cSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) + if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit++ + } + nbTasksPostSplit := nbChunksPostSplit * 2 + if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + // if postSplit we still have less tasks than available CPU + // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. 
+ config.NbTasks /= 2 + var _p G1Jac + chDone := make(chan struct{}, 1) + go func() { + innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + } + + innerMsmG1(p, int(C), points, scalars, config) + + return p, nil +} + +func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) { // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG1(p, int(C), points, digits, splitFirstChunk) - // we have nbSplits intermediate results that we must sum together. - - // _p := make([]G1Jac, nbSplits - 1) - // chDone := make(chan int, nbSplits - 1) - // for i:=0; i < nbSplits-1; i++ { - // start := i * nbPoints - // end := start + nbPoints - // go func(start, end, i int) { - // innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - // chDone <- i - // }(start, end, i) - // } - - // innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - // for i:=0; i < nbSplits-1; i++ { - // done := <-chDone - // p.AddAssign(&_p[done]) - // } - // close(chDone) - return p, nil -} - -func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: @@ -345,54 +347,56 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul C = c } } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } return C } - // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. - // nbSplits := 1 C := bestC(nbPoints) nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar if (fr.Limbs*64)%C != 0 { nbChunks++ } + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split + if config.NbTasks > 1 && nbChunks < config.NbTasks { + // before spliting, let's see if we endup with more tasks than thread; + cSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) + if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit++ + } + nbTasksPostSplit := nbChunksPostSplit * 2 + if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + // if postSplit we still have less tasks than available CPU + // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. 
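On the test side, this patch drops the manual partitionScalars/splitFirstChunk plumbing and drives innerMsm through ecc.MultiExpConfig, asserting that every window size in cRange yields the same point. A schematic of that consistency-check pattern on plain integers; compute is a stand-in for innerMsm, whose result must be independent of c.

package main

import "fmt"

// compute stands in for innerMsm invoked with a given window size c; the
// window size changes the bucketing, not the result, so a plain sum works
// as a schematic here.
func compute(c int, xs []int) int {
	_ = c
	s := 0
	for _, x := range xs {
		s += x
	}
	return s
}

func main() {
	cRange := []int{4, 5, 8, 16}
	xs := []int{3, 1, 4, 1, 5}

	results := make([]int, len(cRange)) // one slot per c; no extra "split" run
	for i, c := range cRange {
		results[i] = compute(c, xs)
	}
	for i := 1; i < len(results); i++ {
		if results[i] != results[i-1] {
			fmt.Println("mismatch at c =", cRange[i])
			return
		}
	}
	fmt.Println("all window sizes agree")
}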
+ config.NbTasks /= 2 + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + } + + innerMsmG2(p, int(C), points, scalars, config) + + return p, nil +} + +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, digits, splitFirstChunk) - // we have nbSplits intermediate results that we must sum together. - - // _p := make([]G2Jac, nbSplits - 1) - // chDone := make(chan int, nbSplits - 1) - // for i:=0; i < nbSplits-1; i++ { - // start := i * nbPoints - // end := start + nbPoints - // go func(start, end, i int) { - // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - // chDone <- i - // }(start, end, i) - // } - - // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - // for i:=0; i < nbSplits-1; i++ { - // done := <-chDone - // p.AddAssign(&_p[done]) - // } - // close(chDone) - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index f77115cab8..e360f2f20f 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -94,8 +94,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -130,14 +129,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -171,14 +165,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range cRange { - scalars, 
_ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -190,39 +179,6 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) - properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( - func(mixer fr.Element) bool { - // multi exp points - var samplePoints [nbSamples]G1Affine - var g G1Jac - g.Set(&g1Gen) - for i := 1; i <= nbSamples; i++ { - samplePoints[i-1].FromJacobian(&g) - g.AddAssign(&g1Gen) - } - // mixer ensures that all the words of a fpElement are set - var sampleScalars [nbSamples]fr.Element - - for i := 1; i <= nbSamples; i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() - } - - var result1, result2 G1Jac - for _, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) - innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) - if !result1.Equal(&result2) { - return false - } - } - return true - }, - genScalar, - )) - // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -420,8 +376,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -454,14 +409,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -495,14 +445,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -514,39 +459,6 @@ func 
TestMultiExpG2(t *testing.T) {
 		genScalar,
 	))
 
-	properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll(
-		func(mixer fr.Element) bool {
-			// multi exp points
-			var samplePoints [nbSamples]G2Affine
-			var g G2Jac
-			g.Set(&g2Gen)
-			for i := 1; i <= nbSamples; i++ {
-				samplePoints[i-1].FromJacobian(&g)
-				g.AddAssign(&g2Gen)
-			}
-			// mixer ensures that all the words of a fpElement are set
-			var sampleScalars [nbSamples]fr.Element
-
-			for i := 1; i <= nbSamples; i++ {
-				sampleScalars[i-1].SetUint64(uint64(i)).
-					Mul(&sampleScalars[i-1], &mixer).
-					FromMont()
-			}
-
-			var result1, result2 G2Jac
-			for _, c := range cRange {
-				scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU())
-				innerMsmG2(&result1, int(c), samplePoints[:], scalars, false)
-				innerMsmG2(&result2, int(c), samplePoints[:], scalars, false)
-				if !result1.Equal(&result2) {
-					return false
-				}
-			}
-			return true
-		},
-		genScalar,
-	))
-
 	// note : this test is here as we expect to have a different multiExp than the above bucket method
 	// for small number of points
 	properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll(
diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go
index 762cecb6c9..3ca13bafc4 100644
--- a/ecc/bw6-633/multiexp.go
+++ b/ecc/bw6-633/multiexp.go
@@ -99,54 +99,56 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul
 			C = c
 		}
 	}
-	// empirical, needs to be tuned.
-	// if C > 16 && nbPoints < 1 << 23 {
-	// 	C = 16
-	// }
 	return C
 }
 
-	// TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars.
-	// nbSplits := 1
 	C := bestC(nbPoints)
 	nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
 	if (fr.Limbs*64)%C != 0 {
 		nbChunks++
 	}
+	// if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split
+	if config.NbTasks > 1 && nbChunks < config.NbTasks {
+		// before splitting, let's see if we end up with more tasks than threads;
+		cSplit := bestC(nbPoints / 2)
+		nbChunksPostSplit := int(fr.Limbs * 64 / cSplit)
+		if (fr.Limbs*64)%cSplit != 0 {
+			nbChunksPostSplit++
+		}
+		nbTasksPostSplit := nbChunksPostSplit * 2
+		if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) {
+			// if postSplit we still have fewer tasks than available CPUs,
+			// or if we have more tasks BUT the difference of CPU usage is in our favor, we split.
+			config.NbTasks /= 2
+			var _p G1Jac
+			chDone := make(chan struct{}, 1)
+			go func() {
+				innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config)
+				close(chDone)
+			}()
+			innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config)
+			<-chDone
+			p.AddAssign(&_p)
+			return p, nil
+		}
+	}
+
+	innerMsmG1(p, int(C), points, scalars, config)
+
+	return p, nil
+}
+
+func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) {
 	// partition the scalars
 	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
 	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
 	// var smallValues int
-	digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+	digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks)
 
 	// if we have more than 10% of small values, we split the processing of the first chunk in 2
 	// we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time
 	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
 
-	innerMsmG1(p, int(C), points, digits, splitFirstChunk)
-	// we have nbSplits intermediate results that we must sum together.
-
-	// _p := make([]G1Jac, nbSplits - 1)
-	// chDone := make(chan int, nbSplits - 1)
-	// for i:=0; i < nbSplits-1; i++ {
-	// 	start := i * nbPoints
-	// 	end := start + nbPoints
-	// 	go func(start, end, i int) {
-	// 		innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk)
-	// 		chDone <- i
-	// 	}(start, end, i)
-	// }
-
-	// innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk)
-	// for i:=0; i < nbSplits-1; i++ {
-	// 	done := <-chDone
-	// 	p.AddAssign(&_p[done])
-	// }
-	// close(chDone)
-	return p, nil
-}
-
-func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) {
 
 	switch c {
 
 	case 4:
@@ -308,54 +310,56 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul
 			C = c
 		}
 	}
-	// empirical, needs to be tuned.
-	// if C > 16 && nbPoints < 1 << 23 {
-	// 	C = 16
-	// }
 	return C
 }
 
-	// TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars.
-	// nbSplits := 1
 	C := bestC(nbPoints)
 	nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
 	if (fr.Limbs*64)%C != 0 {
 		nbChunks++
 	}
+	// if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split
+	if config.NbTasks > 1 && nbChunks < config.NbTasks {
+		// before splitting, let's see if we end up with more tasks than threads;
+		cSplit := bestC(nbPoints / 2)
+		nbChunksPostSplit := int(fr.Limbs * 64 / cSplit)
+		if (fr.Limbs*64)%cSplit != 0 {
+			nbChunksPostSplit++
+		}
+		nbTasksPostSplit := nbChunksPostSplit * 2
+		if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) {
+			// if postSplit we still have fewer tasks than available CPUs,
+			// or if we have more tasks BUT the difference of CPU usage is in our favor, we split.
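The split decision above can be read as a standalone predicate. A minimal sketch, assuming a package-level bestC with the same semantics as the closure above (worthSplitting and numChunks are illustrative names, not part of this patch):

func numChunks(c int) int {
	nb := int(fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
	if (fr.Limbs*64)%c != 0 {
		nb++ // partial top window
	}
	return nb
}

func worthSplitting(nbPoints, nbTasks int) bool {
	nbChunks := numChunks(bestC(nbPoints)) // one task per chunk if we don't split
	if nbTasks <= 1 || nbChunks >= nbTasks {
		return false // the chunks already saturate the task budget
	}
	// two half-size MSMs, each with its own (possibly smaller) window size
	nbTasksPostSplit := 2 * numChunks(bestC(nbPoints / 2))
	// split if the halves still fit in the budget, or if the overshoot is
	// smaller than the number of tasks we would otherwise leave idle
	return nbTasksPostSplit <= nbTasks ||
		(nbTasksPostSplit-nbTasks) <= (nbTasks-nbChunks)
}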
+ config.NbTasks /= 2 + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + } + + innerMsmG2(p, int(C), points, scalars, config) + + return p, nil +} + +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, digits, splitFirstChunk) - // we have nbSplits intermediate results that we must sum together. - - // _p := make([]G2Jac, nbSplits - 1) - // chDone := make(chan int, nbSplits - 1) - // for i:=0; i < nbSplits-1; i++ { - // start := i * nbPoints - // end := start + nbPoints - // go func(start, end, i int) { - // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - // chDone <- i - // }(start, end, i) - // } - - // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - // for i:=0; i < nbSplits-1; i++ { - // done := <-chDone - // p.AddAssign(&_p[done]) - // } - // close(chDone) - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 6946fe3b65..d4e07c80ce 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -94,8 +94,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -130,14 +129,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -171,14 +165,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range cRange { - 
scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -190,39 +179,6 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) - properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( - func(mixer fr.Element) bool { - // multi exp points - var samplePoints [nbSamples]G1Affine - var g G1Jac - g.Set(&g1Gen) - for i := 1; i <= nbSamples; i++ { - samplePoints[i-1].FromJacobian(&g) - g.AddAssign(&g1Gen) - } - // mixer ensures that all the words of a fpElement are set - var sampleScalars [nbSamples]fr.Element - - for i := 1; i <= nbSamples; i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() - } - - var result1, result2 G1Jac - for _, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) - innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) - if !result1.Equal(&result2) { - return false - } - } - return true - }, - genScalar, - )) - // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -420,8 +376,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -454,14 +409,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -495,14 +445,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -514,39 +459,6 
@@ func TestMultiExpG2(t *testing.T) {
 		genScalar,
 	))
 
-	properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll(
-		func(mixer fr.Element) bool {
-			// multi exp points
-			var samplePoints [nbSamples]G2Affine
-			var g G2Jac
-			g.Set(&g2Gen)
-			for i := 1; i <= nbSamples; i++ {
-				samplePoints[i-1].FromJacobian(&g)
-				g.AddAssign(&g2Gen)
-			}
-			// mixer ensures that all the words of a fpElement are set
-			var sampleScalars [nbSamples]fr.Element
-
-			for i := 1; i <= nbSamples; i++ {
-				sampleScalars[i-1].SetUint64(uint64(i)).
-					Mul(&sampleScalars[i-1], &mixer).
-					FromMont()
-			}
-
-			var result1, result2 G2Jac
-			for _, c := range cRange {
-				scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU())
-				innerMsmG2(&result1, int(c), samplePoints[:], scalars, false)
-				innerMsmG2(&result2, int(c), samplePoints[:], scalars, false)
-				if !result1.Equal(&result2) {
-					return false
-				}
-			}
-			return true
-		},
-		genScalar,
-	))
-
 	// note : this test is here as we expect to have a different multiExp than the above bucket method
 	// for small number of points
 	properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll(
diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go
index 649b86facf..87598422a7 100644
--- a/ecc/bw6-756/multiexp.go
+++ b/ecc/bw6-756/multiexp.go
@@ -99,54 +99,56 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul
 			C = c
 		}
 	}
-	// empirical, needs to be tuned.
-	// if C > 16 && nbPoints < 1 << 23 {
-	// 	C = 16
-	// }
 	return C
 }
 
-	// TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars.
-	// nbSplits := 1
 	C := bestC(nbPoints)
 	nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
 	if (fr.Limbs*64)%C != 0 {
 		nbChunks++
 	}
+	// if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split
+	if config.NbTasks > 1 && nbChunks < config.NbTasks {
+		// before splitting, let's see if we end up with more tasks than threads;
+		cSplit := bestC(nbPoints / 2)
+		nbChunksPostSplit := int(fr.Limbs * 64 / cSplit)
+		if (fr.Limbs*64)%cSplit != 0 {
+			nbChunksPostSplit++
+		}
+		nbTasksPostSplit := nbChunksPostSplit * 2
+		if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) {
+			// if postSplit we still have fewer tasks than available CPUs,
+			// or if we have more tasks BUT the difference of CPU usage is in our favor, we split.
+			config.NbTasks /= 2
+			var _p G1Jac
+			chDone := make(chan struct{}, 1)
+			go func() {
+				innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config)
+				close(chDone)
+			}()
+			innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config)
+			<-chDone
+			p.AddAssign(&_p)
+			return p, nil
+		}
+	}
+
+	innerMsmG1(p, int(C), points, scalars, config)
+
+	return p, nil
+}
+
+func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) {
 	// partition the scalars
 	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
 	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
 	// var smallValues int
-	digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+	digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks)
 
 	// if we have more than 10% of small values, we split the processing of the first chunk in 2
 	// we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time
 	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
 
-	innerMsmG1(p, int(C), points, digits, splitFirstChunk)
-	// we have nbSplits intermediate results that we must sum together.
-
-	// _p := make([]G1Jac, nbSplits - 1)
-	// chDone := make(chan int, nbSplits - 1)
-	// for i:=0; i < nbSplits-1; i++ {
-	// 	start := i * nbPoints
-	// 	end := start + nbPoints
-	// 	go func(start, end, i int) {
-	// 		innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk)
-	// 		chDone <- i
-	// 	}(start, end, i)
-	// }
-
-	// innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk)
-	// for i:=0; i < nbSplits-1; i++ {
-	// 	done := <-chDone
-	// 	p.AddAssign(&_p[done])
-	// }
-	// close(chDone)
-	return p, nil
-}
-
-func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) {
 
 	switch c {
 
 	case 4:
@@ -309,54 +311,56 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul
 			C = c
 		}
 	}
-	// empirical, needs to be tuned.
-	// if C > 16 && nbPoints < 1 << 23 {
-	// 	C = 16
-	// }
 	return C
 }
 
-	// TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars.
-	// nbSplits := 1
 	C := bestC(nbPoints)
 	nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
 	if (fr.Limbs*64)%C != 0 {
 		nbChunks++
 	}
+	// if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split
+	if config.NbTasks > 1 && nbChunks < config.NbTasks {
+		// before splitting, let's see if we end up with more tasks than threads;
+		cSplit := bestC(nbPoints / 2)
+		nbChunksPostSplit := int(fr.Limbs * 64 / cSplit)
+		if (fr.Limbs*64)%cSplit != 0 {
+			nbChunksPostSplit++
+		}
+		nbTasksPostSplit := nbChunksPostSplit * 2
+		if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) {
+			// if postSplit we still have fewer tasks than available CPUs,
+			// or if we have more tasks BUT the difference of CPU usage is in our favor, we split.
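The carry mentioned in the innerMsm comments (a window digit larger than 2^{c-1} borrows from the next window) can be seen on a toy recoding. This sketch is illustrative only; signedDigits is not the library routine, and partitionScalars additionally packs the digits for parallel chunk processing:

func signedDigits(k uint64, c int) []int64 {
	mask := uint64(1)<<c - 1
	var digits []int64
	carry := uint64(0)
	for k != 0 || carry != 0 {
		d := (k & mask) + carry
		k >>= c
		carry = 0
		if d > 1<<(c-1) {
			// window value too large for a bucket: emit d - 2^c and
			// propagate a carry into the next (higher) window
			digits = append(digits, int64(d)-int64(1)<<c)
			carry = 1
		} else {
			digits = append(digits, int64(d))
		}
	}
	return digits
}

For c = 4 and k = 13, the first window holds 13 > 8, so it becomes -3 with a carry, giving digits [-3, 1] (13 = -3 + 1*16); every digit lands in [-(2^{c-1}-1), 2^{c-1}], which halves the number of buckets needed.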
+ config.NbTasks /= 2 + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + } + + innerMsmG2(p, int(C), points, scalars, config) + + return p, nil +} + +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, digits, splitFirstChunk) - // we have nbSplits intermediate results that we must sum together. - - // _p := make([]G2Jac, nbSplits - 1) - // chDone := make(chan int, nbSplits - 1) - // for i:=0; i < nbSplits-1; i++ { - // start := i * nbPoints - // end := start + nbPoints - // go func(start, end, i int) { - // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - // chDone <- i - // }(start, end, i) - // } - - // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - // for i:=0; i < nbSplits-1; i++ { - // done := <-chDone - // p.AddAssign(&_p[done]) - // } - // close(chDone) - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index 73217d1e6a..64f9ca10ec 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -94,8 +94,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -130,14 +129,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -171,14 +165,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range cRange { - 
scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -190,39 +179,6 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) - properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( - func(mixer fr.Element) bool { - // multi exp points - var samplePoints [nbSamples]G1Affine - var g G1Jac - g.Set(&g1Gen) - for i := 1; i <= nbSamples; i++ { - samplePoints[i-1].FromJacobian(&g) - g.AddAssign(&g1Gen) - } - // mixer ensures that all the words of a fpElement are set - var sampleScalars [nbSamples]fr.Element - - for i := 1; i <= nbSamples; i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() - } - - var result1, result2 G1Jac - for _, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) - innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) - if !result1.Equal(&result2) { - return false - } - } - return true - }, - genScalar, - )) - // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -420,8 +376,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -454,14 +409,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -495,14 +445,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -514,39 +459,6 
@@ func TestMultiExpG2(t *testing.T) {
 		genScalar,
 	))
 
-	properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll(
-		func(mixer fr.Element) bool {
-			// multi exp points
-			var samplePoints [nbSamples]G2Affine
-			var g G2Jac
-			g.Set(&g2Gen)
-			for i := 1; i <= nbSamples; i++ {
-				samplePoints[i-1].FromJacobian(&g)
-				g.AddAssign(&g2Gen)
-			}
-			// mixer ensures that all the words of a fpElement are set
-			var sampleScalars [nbSamples]fr.Element
-
-			for i := 1; i <= nbSamples; i++ {
-				sampleScalars[i-1].SetUint64(uint64(i)).
-					Mul(&sampleScalars[i-1], &mixer).
-					FromMont()
-			}
-
-			var result1, result2 G2Jac
-			for _, c := range cRange {
-				scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU())
-				innerMsmG2(&result1, int(c), samplePoints[:], scalars, false)
-				innerMsmG2(&result2, int(c), samplePoints[:], scalars, false)
-				if !result1.Equal(&result2) {
-					return false
-				}
-			}
-			return true
-		},
-		genScalar,
-	))
-
 	// note : this test is here as we expect to have a different multiExp than the above bucket method
 	// for small number of points
 	properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll(
diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go
index 9eb52c0130..d5165db4a5 100644
--- a/ecc/bw6-761/multiexp.go
+++ b/ecc/bw6-761/multiexp.go
@@ -99,54 +99,56 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul
 			C = c
 		}
 	}
-	// empirical, needs to be tuned.
-	// if C > 16 && nbPoints < 1 << 23 {
-	// 	C = 16
-	// }
 	return C
 }
 
-	// TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars.
-	// nbSplits := 1
 	C := bestC(nbPoints)
 	nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
 	if (fr.Limbs*64)%C != 0 {
 		nbChunks++
 	}
+	// if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split
+	if config.NbTasks > 1 && nbChunks < config.NbTasks {
+		// before splitting, let's see if we end up with more tasks than threads;
+		cSplit := bestC(nbPoints / 2)
+		nbChunksPostSplit := int(fr.Limbs * 64 / cSplit)
+		if (fr.Limbs*64)%cSplit != 0 {
+			nbChunksPostSplit++
+		}
+		nbTasksPostSplit := nbChunksPostSplit * 2
+		if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) {
+			// if postSplit we still have fewer tasks than available CPUs,
+			// or if we have more tasks BUT the difference of CPU usage is in our favor, we split.
+			config.NbTasks /= 2
+			var _p G1Jac
+			chDone := make(chan struct{}, 1)
+			go func() {
+				innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config)
+				close(chDone)
+			}()
+			innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config)
+			<-chDone
+			p.AddAssign(&_p)
+			return p, nil
+		}
+	}
+
+	innerMsmG1(p, int(C), points, scalars, config)
+
+	return p, nil
+}
+
+func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) {
 	// partition the scalars
 	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
 	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
 	// var smallValues int
-	digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+	digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks)
 
 	// if we have more than 10% of small values, we split the processing of the first chunk in 2
 	// we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time
 	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
 
-	innerMsmG1(p, int(C), points, digits, splitFirstChunk)
-	// we have nbSplits intermediate results that we must sum together.
-
-	// _p := make([]G1Jac, nbSplits - 1)
-	// chDone := make(chan int, nbSplits - 1)
-	// for i:=0; i < nbSplits-1; i++ {
-	// 	start := i * nbPoints
-	// 	end := start + nbPoints
-	// 	go func(start, end, i int) {
-	// 		innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk)
-	// 		chDone <- i
-	// 	}(start, end, i)
-	// }
-
-	// innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk)
-	// for i:=0; i < nbSplits-1; i++ {
-	// 	done := <-chDone
-	// 	p.AddAssign(&_p[done])
-	// }
-	// close(chDone)
-	return p, nil
-}
-
-func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) {
 
 	switch c {
 
 	case 4:
@@ -309,54 +311,56 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul
 			C = c
 		}
 	}
-	// empirical, needs to be tuned.
-	// if C > 16 && nbPoints < 1 << 23 {
-	// 	C = 16
-	// }
 	return C
 }
 
-	// TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars.
-	// nbSplits := 1
 	C := bestC(nbPoints)
 	nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
 	if (fr.Limbs*64)%C != 0 {
 		nbChunks++
 	}
+	// if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split
+	if config.NbTasks > 1 && nbChunks < config.NbTasks {
+		// before splitting, let's see if we end up with more tasks than threads;
+		cSplit := bestC(nbPoints / 2)
+		nbChunksPostSplit := int(fr.Limbs * 64 / cSplit)
+		if (fr.Limbs*64)%cSplit != 0 {
+			nbChunksPostSplit++
+		}
+		nbTasksPostSplit := nbChunksPostSplit * 2
+		if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) {
+			// if postSplit we still have fewer tasks than available CPUs,
+			// or if we have more tasks BUT the difference of CPU usage is in our favor, we split.
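From the caller's side, the task budget that feeds this heuristic comes in through ecc.MultiExpConfig, the same way the updated tests drive it. A minimal usage sketch (variable names illustrative; runtime and ecc are the imports assumed):

	var p G1Jac
	if _, err := p.MultiExp(points, scalars, ecc.MultiExpConfig{
		NbTasks: runtime.NumCPU(), // upper bound on the tasks MultiExp may spawn
	}); err != nil {
		// handle the error (e.g. a length mismatch between points and scalars)
	}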
+ config.NbTasks /= 2 + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + } + + innerMsmG2(p, int(C), points, scalars, config) + + return p, nil +} + +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, digits, splitFirstChunk) - // we have nbSplits intermediate results that we must sum together. - - // _p := make([]G2Jac, nbSplits - 1) - // chDone := make(chan int, nbSplits - 1) - // for i:=0; i < nbSplits-1; i++ { - // start := i * nbPoints - // end := start + nbPoints - // go func(start, end, i int) { - // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - // chDone <- i - // }(start, end, i) - // } - - // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - // for i:=0; i < nbSplits-1; i++ { - // done := <-chDone - // p.AddAssign(&_p[done]) - // } - // close(chDone) - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index e36993cfff..613a4bac53 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -94,8 +94,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -130,14 +129,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -171,14 +165,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range cRange { - 
scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -190,39 +179,6 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) - properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( - func(mixer fr.Element) bool { - // multi exp points - var samplePoints [nbSamples]G1Affine - var g G1Jac - g.Set(&g1Gen) - for i := 1; i <= nbSamples; i++ { - samplePoints[i-1].FromJacobian(&g) - g.AddAssign(&g1Gen) - } - // mixer ensures that all the words of a fpElement are set - var sampleScalars [nbSamples]fr.Element - - for i := 1; i <= nbSamples; i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() - } - - var result1, result2 G1Jac - for _, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) - innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) - if !result1.Equal(&result2) { - return false - } - } - return true - }, - genScalar, - )) - // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -420,8 +376,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -454,14 +409,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -495,14 +445,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -514,39 +459,6 
@@ func TestMultiExpG2(t *testing.T) {
 		genScalar,
 	))
 
-	properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll(
-		func(mixer fr.Element) bool {
-			// multi exp points
-			var samplePoints [nbSamples]G2Affine
-			var g G2Jac
-			g.Set(&g2Gen)
-			for i := 1; i <= nbSamples; i++ {
-				samplePoints[i-1].FromJacobian(&g)
-				g.AddAssign(&g2Gen)
-			}
-			// mixer ensures that all the words of a fpElement are set
-			var sampleScalars [nbSamples]fr.Element
-
-			for i := 1; i <= nbSamples; i++ {
-				sampleScalars[i-1].SetUint64(uint64(i)).
-					Mul(&sampleScalars[i-1], &mixer).
-					FromMont()
-			}
-
-			var result1, result2 G2Jac
-			for _, c := range cRange {
-				scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU())
-				innerMsmG2(&result1, int(c), samplePoints[:], scalars, false)
-				innerMsmG2(&result2, int(c), samplePoints[:], scalars, false)
-				if !result1.Equal(&result2) {
-					return false
-				}
-			}
-			return true
-		},
-		genScalar,
-	))
-
 	// note : this test is here as we expect to have a different multiExp than the above bucket method
 	// for small number of points
 	properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll(
diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl
index 9e5ce00b70..67ed6b9027 100644
--- a/internal/generator/ecc/template/multiexp.go.tmpl
+++ b/internal/generator/ecc/template/multiexp.go.tmpl
@@ -368,56 +368,58 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem
 			C = c
 		}
 	}
-	// empirical, needs to be tuned.
-	// if C > 16 && nbPoints < 1 << 23 {
-	// 	C = 16
-	// }
 	return C
 }
 
-	// TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars.
-	// nbSplits := 1
 	C := bestC(nbPoints)
-	nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
-	if (fr.Limbs * 64) % C != 0 {
-		nbChunks ++
+	nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+	if (fr.Limbs*64)%C != 0 {
+		nbChunks++
 	}
+	// if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split
+	if config.NbTasks > 1 && nbChunks < config.NbTasks {
+		// before splitting, let's see if we end up with more tasks than threads;
+		cSplit := bestC(nbPoints/2)
+		nbChunksPostSplit := int(fr.Limbs * 64 / cSplit)
+		if (fr.Limbs*64)%cSplit != 0 {
+			nbChunksPostSplit++
+		}
+		nbTasksPostSplit := nbChunksPostSplit*2
+		if (nbTasksPostSplit <= config.NbTasks) || ( nbTasksPostSplit - config.NbTasks ) <= ( config.NbTasks - nbChunks) {
+			// if postSplit we still have fewer tasks than available CPUs,
+			// or if we have more tasks BUT the difference of CPU usage is in our favor, we split.
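Concretely, for a 256-bit scalar field (fr.Limbs = 4) with C = 16 the computation above yields nbChunks = 256/16 = 16; on a budget of config.NbTasks = 32, half the tasks would sit idle. Assuming bestC also returns 16 for the half size, splitting gives nbTasksPostSplit = 2 * 16 = 32 <= 32, so the branch below runs the two half-MSMs concurrently and saturates the budget, while config.NbTasks /= 2 keeps the two inner calls from oversubscribing it between them.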
+ config.NbTasks /= 2 + var _p {{ $.TJacobian }} + chDone := make(chan struct{}, 1) + go func() { + innerMsm{{ $.UPointName }}(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + innerMsm{{ $.UPointName }}(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + } + + innerMsm{{ $.UPointName }}(p, int(C), points, scalars, config) + + return p, nil +} + + +func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffine }}, scalars []fr.Element, config ecc.MultiExpConfig) { // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsm{{ $.UPointName }}(p, int(C), points, digits, splitFirstChunk) - // we have nbSplits intermediate results that we must sum together. - - - // _p := make([]{{ $.TJacobian }}, nbSplits - 1) - // chDone := make(chan int, nbSplits - 1) - // for i:=0; i < nbSplits-1; i++ { - // start := i * nbPoints - // end := start + nbPoints - // go func(start, end, i int) { - // innerMsm{{ $.UPointName }}(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - // chDone <- i - // }(start, end, i) - // } - - // innerMsm{{ $.UPointName }}(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - // for i:=0; i < nbSplits-1; i++ { - // done := <-chDone - // p.AddAssign(&_p[done]) - // } - // close(chDone) - return p, nil -} - -func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffine }}, digits []uint32, splitFirstChunk bool) { {{- /* TODO @gbotrel need to deal with cases where lastC == 1 ; having a whole chunk with 1-bit window makes no sense */}} {{- /* also need to determine until which window size the ext-jacobian version is worth it. 
*/}} switch c { diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index 25102e24d0..e6cd2b0338 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -10,11 +10,11 @@ import ( "fmt" "time" + "runtime" "math/rand" rrand "crypto/rand" "math/big" "testing" - "runtime" "math/bits" "sync" "math" @@ -93,8 +93,7 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsm{{ toUpper $.PointName }}(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsm{{ toUpper $.PointName }}(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks:runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -138,14 +137,9 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { } - results := make([]{{ $.TJacobian }}, len(cRange) + 1) + results := make([]{{ $.TJacobian }}, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsm{{ toUpper $.PointName }}(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsm{{ toUpper $.PointName }}(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsm{{ toUpper $.PointName }}(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks:runtime.NumCPU()}) } for i:=1; i < len(results);i++ { if !results[i].Equal(&results[i-1]) { @@ -179,14 +173,9 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { FromMont() } - results := make([]{{ $.TJacobian }}, len(cRange)+1) + results := make([]{{ $.TJacobian }}, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsm{{ toUpper $.PointName }}(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsm{{ toUpper $.PointName }}(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsm{{ toUpper $.PointName }}(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks:runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -199,40 +188,6 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { )) - properties.Property(fmt.Sprintf("[{{ toUpper $.PointName }}] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( - func(mixer fr.Element) bool { - // multi exp points - var samplePoints [nbSamples]{{ $.TAffine }} - var g {{ $.TJacobian }} - g.Set(&{{ toLower .PointName}}Gen) - for i := 1; i <= nbSamples; i++ { - samplePoints[i-1].FromJacobian(&g) - g.AddAssign(&{{ toLower .PointName}}Gen) - } - // mixer ensures that all the words of a fpElement are set - var sampleScalars [nbSamples]fr.Element - - for i := 1; i <= nbSamples; i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). 
- FromMont() - } - - var result1, result2 {{ $.TJacobian }} - for _, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsm{{ toUpper $.PointName }}(&result1, int(c), samplePoints[:], scalars, false) - innerMsm{{ toUpper $.PointName }}(&result2, int(c), samplePoints[:], scalars, false) - if !result1.Equal(&result2) { - return false - } - } - return true - }, - genScalar, - )) - - // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[{{ toUpper $.PointName }}] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( From 227a8f27c68f5ebadcf99aa359b7fc761d4830e8 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Fri, 11 Nov 2022 15:57:17 +0000 Subject: [PATCH 14/43] fix: restore previous way to generate scalars in benches --- ecc/bls12-377/multiexp_test.go | 63 +++++++++---------- ecc/bls12-378/multiexp_test.go | 63 +++++++++---------- ecc/bls12-381/multiexp_test.go | 63 +++++++++---------- ecc/bls24-315/multiexp_test.go | 63 +++++++++---------- ecc/bls24-317/multiexp_test.go | 63 +++++++++---------- ecc/bn254/multiexp_test.go | 63 +++++++++---------- ecc/bw6-633/multiexp_test.go | 63 +++++++++---------- ecc/bw6-756/multiexp_test.go | 63 +++++++++---------- ecc/bw6-761/multiexp_test.go | 63 +++++++++---------- .../ecc/template/tests/multiexp.go.tmpl | 41 ++++++------ 10 files changed, 300 insertions(+), 308 deletions(-) diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 3da962b555..4b4406e922 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -17,9 +17,7 @@ package bls12377 import ( - rrand "crypto/rand" "fmt" - "math" "math/big" "math/bits" "math/rand" @@ -30,7 +28,6 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" - "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -247,7 +244,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G1Affine @@ -304,17 +301,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. 
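The rewritten helpers below replace crypto/rand plus parallel.Execute with a fixed-seed walk, so benchmark inputs are reproducible across runs while staying pairwise distinct (neighbouring points differ in both coordinates, which keeps bucket accumulation on the addition path rather than the doubling path, as the comment above explains). A sketch of how a benchmark can consume them; note that the new fillBenchBasesG1 scales samplePoints[0] in place, so seeding it is the caller's job here, and the sizes and names are illustrative:

func BenchmarkMsmSketch(b *testing.B) {
	const nbSamples = 1 << 16
	var samplePoints [nbSamples]G1Affine
	var sampleScalars [nbSamples]fr.Element

	samplePoints[0] = g1GenAff // seed for the deterministic walk (assumption: any nonzero point works)
	fillBenchBasesG1(samplePoints[:])
	fillBenchScalars(sampleScalars[:])

	var p G1Jac
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		p.MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{})
	}
}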
func fillBenchBasesG1(samplePoints []G1Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g1GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func TestMultiExpG2(t *testing.T) { @@ -527,7 +524,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G2Affine @@ -584,24 +581,26 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG2(samplePoints []G2Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g2GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - parallel.Execute(len(sampleScalars), func(start, end int) { - for i := start; i < end; i++ { - sampleScalars[i].SetRandom() - } - }) + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + for i := 1; i <= len(sampleScalars); i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). + FromMont() + } } diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index 1cf20b793f..b710acf39b 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -17,9 +17,7 @@ package bls12378 import ( - rrand "crypto/rand" "fmt" - "math" "math/big" "math/bits" "math/rand" @@ -30,7 +28,6 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-378/fr" - "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -247,7 +244,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G1Affine @@ -304,17 +301,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. 
Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG1(samplePoints []G1Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g1GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func TestMultiExpG2(t *testing.T) { @@ -527,7 +524,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G2Affine @@ -584,24 +581,26 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG2(samplePoints []G2Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g2GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - parallel.Execute(len(sampleScalars), func(start, end int) { - for i := start; i < end; i++ { - sampleScalars[i].SetRandom() - } - }) + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + for i := 1; i <= len(sampleScalars); i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } } diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index b44ec363a2..1f8539c0bf 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -17,9 +17,7 @@ package bls12381 import ( - rrand "crypto/rand" "fmt" - "math" "math/big" "math/bits" "math/rand" @@ -30,7 +28,6 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-381/fr" - "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -247,7 +244,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G1Affine @@ -304,17 +301,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG1(samplePoints []G1Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g1GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func TestMultiExpG2(t *testing.T) { @@ -527,7 +524,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G2Affine @@ -584,24 +581,26 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. 
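One caveat worth noting: the (X+1, Y-1) walk below produces points that are in general not on the curve. That is harmless for timing bucket accumulation, where only coordinate arithmetic matters, but such bases must not be reused in correctness tests; the existing affine IsOnCurve method is the guard to reach for if these helpers ever leak outside the benchmarks.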
func fillBenchBasesG2(samplePoints []G2Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g2GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - parallel.Execute(len(sampleScalars), func(start, end int) { - for i := start; i < end; i++ { - sampleScalars[i].SetRandom() - } - }) + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + for i := 1; i <= len(sampleScalars); i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). + FromMont() + } } diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index 8da7433fa9..9307ba079d 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -17,9 +17,7 @@ package bls24315 import ( - rrand "crypto/rand" "fmt" - "math" "math/big" "math/bits" "math/rand" @@ -30,7 +28,6 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls24-315/fr" - "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -247,7 +244,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G1Affine @@ -304,17 +301,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG1(samplePoints []G1Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g1GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func TestMultiExpG2(t *testing.T) { @@ -527,7 +524,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G2Affine @@ -584,24 +581,26 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. 
Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG2(samplePoints []G2Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g2GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - parallel.Execute(len(sampleScalars), func(start, end int) { - for i := start; i < end; i++ { - sampleScalars[i].SetRandom() - } - }) + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + for i := 1; i <= len(sampleScalars); i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). + FromMont() + } } diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index 3d99991dc1..5945e42e8a 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -17,9 +17,7 @@ package bls24317 import ( - rrand "crypto/rand" "fmt" - "math" "math/big" "math/bits" "math/rand" @@ -30,7 +28,6 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls24-317/fr" - "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -247,7 +244,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G1Affine @@ -304,17 +301,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. 
func fillBenchBasesG1(samplePoints []G1Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g1GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func TestMultiExpG2(t *testing.T) { @@ -527,7 +524,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G2Affine @@ -584,24 +581,26 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG2(samplePoints []G2Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g2GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - parallel.Execute(len(sampleScalars), func(start, end int) { - for i := start; i < end; i++ { - sampleScalars[i].SetRandom() - } - }) + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + for i := 1; i <= len(sampleScalars); i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). + FromMont() + } } diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index e360f2f20f..23dc3b5897 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -17,9 +17,7 @@ package bn254 import ( - rrand "crypto/rand" "fmt" - "math" "math/big" "math/bits" "math/rand" @@ -30,7 +28,6 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bn254/fr" - "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -247,7 +244,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G1Affine @@ -304,17 +301,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. 
Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG1(samplePoints []G1Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g1GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func TestMultiExpG2(t *testing.T) { @@ -527,7 +524,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G2Affine @@ -584,24 +581,26 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG2(samplePoints []G2Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g2GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - parallel.Execute(len(sampleScalars), func(start, end int) { - for i := start; i < end; i++ { - sampleScalars[i].SetRandom() - } - }) + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + for i := 1; i <= len(sampleScalars); i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } } diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index d4e07c80ce..4c40debed6 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -17,9 +17,7 @@ package bw6633 import ( - rrand "crypto/rand" "fmt" - "math" "math/big" "math/bits" "math/rand" @@ -30,7 +28,6 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-633/fr" - "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -247,7 +244,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G1Affine @@ -304,17 +301,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG1(samplePoints []G1Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g1GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func TestMultiExpG2(t *testing.T) { @@ -527,7 +524,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G2Affine @@ -584,24 +581,26 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. 
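fillBenchScalars follows the same philosophy. Instead of drawing from crypto/rand on every run, sample i-1 is set to i times a fixed full-width mixer, so all limbs of every scalar are (with overwhelming probability) non-zero and successive benchmark runs see identical inputs; the trailing FromMont() converts the product out of Montgomery form. A self-contained restatement, again using the bn254 fr package as a stand-in for the per-curve fr:

import "github.com/consensys/gnark-crypto/ecc/bn254/fr"

// fillScalars mirrors the generated fillBenchScalars: sample i-1 is
// i * mixer. The mixer is a fixed, full-width field element, so no limb of
// any sample is left at zero, and the sequence is reproducible across runs
// (the crypto/rand based filler it replaces was not).
func fillScalars(sampleScalars []fr.Element) {
	var mixer fr.Element
	mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487")
	for i := 1; i <= len(sampleScalars); i++ {
		sampleScalars[i-1].SetUint64(uint64(i)).
			Mul(&sampleScalars[i-1], &mixer).
			FromMont()
	}
}

Calling fillScalars twice yields identical slices, which keeps reference numbers comparable across commits; the nbSamples reduction from 1<<23 to 1<<20 in the same hunks trims the setup cost of the reference benchmarks.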
func fillBenchBasesG2(samplePoints []G2Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g2GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - parallel.Execute(len(sampleScalars), func(start, end int) { - for i := start; i < end; i++ { - sampleScalars[i].SetRandom() - } - }) + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + for i := 1; i <= len(sampleScalars); i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). + FromMont() + } } diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index 64f9ca10ec..d79044f69c 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -17,9 +17,7 @@ package bw6756 import ( - rrand "crypto/rand" "fmt" - "math" "math/big" "math/bits" "math/rand" @@ -30,7 +28,6 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-756/fr" - "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -247,7 +244,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G1Affine @@ -304,17 +301,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG1(samplePoints []G1Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g1GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func TestMultiExpG2(t *testing.T) { @@ -527,7 +524,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G2Affine @@ -584,24 +581,26 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. 
Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG2(samplePoints []G2Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g2GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - parallel.Execute(len(sampleScalars), func(start, end int) { - for i := start; i < end; i++ { - sampleScalars[i].SetRandom() - } - }) + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + for i := 1; i <= len(sampleScalars); i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). + FromMont() + } } diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index 613a4bac53..2dcc22a913 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -17,9 +17,7 @@ package bw6761 import ( - rrand "crypto/rand" "fmt" - "math" "math/big" "math/bits" "math/rand" @@ -30,7 +28,6 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" - "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -247,7 +244,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G1Affine @@ -304,17 +301,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. 
func fillBenchBasesG1(samplePoints []G1Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g1GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func TestMultiExpG2(t *testing.T) { @@ -527,7 +524,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G2Affine @@ -584,24 +581,26 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG2(samplePoints []G2Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g2GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - parallel.Execute(len(sampleScalars), func(start, end int) { - for i := start; i < end; i++ { - sampleScalars[i].SetRandom() - } - }) + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + for i := 1; i <= len(sampleScalars); i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } } diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index e6cd2b0338..070481bf7b 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -12,14 +12,11 @@ import ( "time" "runtime" "math/rand" - rrand "crypto/rand" "math/big" "testing" "math/bits" "sync" - "math" - "github.com/consensys/gnark-crypto/internal/parallel" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/{{.Name}}/fr" "github.com/leanovate/gopter" @@ -262,7 +259,7 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { func BenchmarkMultiExp{{ toUpper $.PointName }}Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]{{ $.TAffine }} @@ -321,26 +318,30 @@ func BenchmarkManyMultiExp{{ toUpper $.PointName }}Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBases{{ toUpper $.PointName }}(samplePoints []{{ $.TAffine }}) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&{{$.PointName}}GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start+1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } + {{end }} + func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - parallel.Execute(len(sampleScalars), func(start, end int) { - for i := start; i < end; i++ { - sampleScalars[i].SetRandom() - } - }) + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + for i := 1; i <= len(sampleScalars); i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } } From f973cf4bb1fcf6abc7a4ea3db4b12dc584f79c80 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Fri, 11 Nov 2022 16:04:19 +0000 Subject: [PATCH 15/43] fix: fix splitting logic in msm --- ecc/bls12-377/multiexp.go | 12 ++++++------ ecc/bls12-378/multiexp.go | 12 ++++++------ ecc/bls12-381/multiexp.go | 12 ++++++------ ecc/bls24-315/multiexp.go | 12 ++++++------ ecc/bls24-317/multiexp.go | 12 ++++++------ ecc/bn254/multiexp.go | 12 ++++++------ ecc/bw6-633/multiexp.go | 12 ++++++------ ecc/bw6-756/multiexp.go | 12 ++++++------ ecc/bw6-761/multiexp.go | 12 ++++++------ internal/generator/ecc/template/multiexp.go.tmpl | 6 +++--- 10 files changed, 57 insertions(+), 57 deletions(-) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index 3c7e9fa3e4..53a1823e0d 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -116,17 +116,17 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. config.NbTasks /= 2 var _p G1Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil @@ -364,17 +364,17 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. config.NbTasks /= 2 var _p G2Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 2371d32a5c..73e162f80d 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -116,17 +116,17 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. 
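The one-line change repeated in every hunk of this patch is easiest to read as a predicate. When an MSM is split in two halves, each half runs with config.NbTasks/2, so the post-split task count has to be weighed against half the budget; comparing it against the full budget, as the old code did, made the split look cheaper than it is. A condensed restatement with ad hoc names, extracted from the hunks above:

// shouldSplit restates the corrected condition.
// nbChunks: chunks at the current window size, for the whole MSM.
// nbChunksPostSplit: chunks each half would need at the smaller window.
// nbTasks: the caller's total task budget (config.NbTasks).
func shouldSplit(nbChunks, nbChunksPostSplit, nbTasks int) bool {
	nbTasksPostSplit := nbChunksPostSplit * 2 // two halves run concurrently

	// split if the two halves still fit in half the budget each, or if the
	// task overshoot after splitting is no worse than the CPU left idle
	// without splitting.
	return nbTasksPostSplit <= nbTasks/2 ||
		(nbTasksPostSplit-nbTasks/2) <= (nbTasks-nbChunks)
}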
config.NbTasks /= 2 var _p G1Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil @@ -364,17 +364,17 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. config.NbTasks /= 2 var _p G2Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index e13ca90588..80ff8bfc30 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -116,17 +116,17 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. config.NbTasks /= 2 var _p G1Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil @@ -364,17 +364,17 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. 
config.NbTasks /= 2 var _p G2Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index d73bb2783e..f61ab96f3d 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -116,17 +116,17 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. config.NbTasks /= 2 var _p G1Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil @@ -364,17 +364,17 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. config.NbTasks /= 2 var _p G2Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index ceee16e7fd..8b81840e50 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -116,17 +116,17 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. 
config.NbTasks /= 2 var _p G1Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil @@ -364,17 +364,17 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. config.NbTasks /= 2 var _p G2Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index 1167251103..ac979ddff7 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -116,17 +116,17 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. config.NbTasks /= 2 var _p G1Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil @@ -364,17 +364,17 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. 
config.NbTasks /= 2 var _p G2Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index 3ca13bafc4..23c35d3d90 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -116,17 +116,17 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. config.NbTasks /= 2 var _p G1Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil @@ -327,17 +327,17 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. config.NbTasks /= 2 var _p G2Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index 87598422a7..0124b603f4 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -116,17 +116,17 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. 
config.NbTasks /= 2 var _p G1Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil @@ -328,17 +328,17 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. config.NbTasks /= 2 var _p G2Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index d5165db4a5..fc2c7c4908 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -116,17 +116,17 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. config.NbTasks /= 2 var _p G1Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil @@ -328,17 +328,17 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. 
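The other change in this patch is that the split no longer calls innerMsm* with a precomputed cSplit window: each half goes back through MultiExp, which re-derives the best window size for the halved task budget and may split again. The shape of that recursion, sketched against the bn254 types (splitMSM is an illustrative name; error handling is elided, as in the generated code):

import (
	"github.com/consensys/gnark-crypto/ecc"
	"github.com/consensys/gnark-crypto/ecc/bn254"
	"github.com/consensys/gnark-crypto/ecc/bn254/fr"
)

// splitMSM shows the split-and-merge pattern: halve the task budget, run
// the first half in a goroutine and the second half on the current
// goroutine, then fold the two partial results together.
func splitMSM(p *bn254.G1Jac, points []bn254.G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *bn254.G1Jac {
	n := len(points)
	config.NbTasks /= 2 // each recursive call sees half the budget

	var q bn254.G1Jac
	done := make(chan struct{}, 1)
	go func() {
		q.MultiExp(points[:n/2], scalars[:n/2], config)
		close(done)
	}()
	p.MultiExp(points[n/2:], scalars[n/2:], config)
	<-done
	return p.AddAssign(&q)
}

config is passed by value, so halving NbTasks inside the call does not leak into the caller.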
config.NbTasks /= 2 var _p G2Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index 67ed6b9027..5940c0e8c5 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -385,17 +385,17 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit*2 - if (nbTasksPostSplit <= config.NbTasks) || ( nbTasksPostSplit - config.NbTasks ) <= ( config.NbTasks - nbChunks) { + if (nbTasksPostSplit <= config.NbTasks /2 ) || ( nbTasksPostSplit - config.NbTasks/2 ) <= ( config.NbTasks - nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. config.NbTasks /= 2 var _p {{ $.TJacobian }} chDone := make(chan struct{}, 1) go func() { - innerMsm{{ $.UPointName }}(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsm{{ $.UPointName }}(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil From 85e6ea0cbb23c22ee1b6499b2ce37444c9f50005 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Mon, 14 Nov 2022 11:52:42 -0600 Subject: [PATCH 16/43] feat: store neg(P) and P in opposite sides of batch add input slice --- ecc/bls12-377/g1.go | 65 +++++++---- ecc/bls12-377/g1_test.go | 5 +- ecc/bls12-377/g2.go | 65 +++++++---- ecc/bls12-377/g2_test.go | 5 +- ecc/bls12-377/multiexp_affine.go | 106 +++++++++++------- ecc/bls12-378/g1.go | 65 +++++++---- ecc/bls12-378/g1_test.go | 5 +- ecc/bls12-378/g2.go | 65 +++++++---- ecc/bls12-378/g2_test.go | 5 +- ecc/bls12-378/multiexp_affine.go | 106 +++++++++++------- ecc/bls12-381/g1.go | 65 +++++++---- ecc/bls12-381/g1_test.go | 5 +- ecc/bls12-381/g2.go | 65 +++++++---- ecc/bls12-381/g2_test.go | 5 +- ecc/bls12-381/multiexp_affine.go | 106 +++++++++++------- ecc/bls24-315/g1.go | 65 +++++++---- ecc/bls24-315/g1_test.go | 5 +- ecc/bls24-315/g2.go | 65 +++++++---- ecc/bls24-315/g2_test.go | 5 +- ecc/bls24-315/multiexp_affine.go | 106 +++++++++++------- ecc/bls24-317/g1.go | 65 +++++++---- ecc/bls24-317/g1_test.go | 5 +- ecc/bls24-317/g2.go | 65 +++++++---- ecc/bls24-317/g2_test.go | 5 +- ecc/bls24-317/multiexp_affine.go | 106 +++++++++++------- ecc/bn254/g1.go | 65 +++++++---- ecc/bn254/g1_test.go | 5 +- ecc/bn254/g2.go | 65 +++++++---- ecc/bn254/g2_test.go | 5 +- ecc/bn254/multiexp_affine.go | 106 +++++++++++------- ecc/bw6-633/g1.go | 65 +++++++---- ecc/bw6-633/g1_test.go | 5 +- ecc/bw6-633/g2.go | 65 +++++++---- ecc/bw6-633/g2_test.go | 5 +- ecc/bw6-633/multiexp_affine.go | 106 +++++++++++------- ecc/bw6-756/g1.go | 65 +++++++---- ecc/bw6-756/g1_test.go | 5 +- ecc/bw6-756/g2.go | 65 +++++++---- ecc/bw6-756/g2_test.go | 5 +- ecc/bw6-756/multiexp_affine.go | 106 +++++++++++------- ecc/bw6-761/g1.go | 65 +++++++---- ecc/bw6-761/g1_test.go | 5 +- ecc/bw6-761/g2.go | 65 
+++++++---- ecc/bw6-761/g2_test.go | 5 +- ecc/bw6-761/multiexp_affine.go | 106 +++++++++++------- .../ecc/template/multiexp_affine.go.tmpl | 54 +++++---- internal/generator/ecc/template/point.go.tmpl | 65 +++++++---- .../ecc/template/tests/point.go.tmpl | 5 +- 48 files changed, 1502 insertions(+), 836 deletions(-) diff --git a/ecc/bls12-377/g1.go b/ecc/bls12-377/g1.go index 962a527a2e..3602b14992 100644 --- a/ecc/bls12-377/g1.go +++ b/ecc/bls12-377/g1.go @@ -980,25 +980,29 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contain references to the points to ADD +// R[N-cptSub:], P[N-cptSub:] contain references to the points to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -1007,17 +1011,11 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { var d fp.Element var rr G1Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // compute lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1028,6 +1026,27 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // compute lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bls12-377/g1_test.go b/ecc/bls12-377/g1_test.go index afb23458b1..105a2d0a9a 100644 --- a/ecc/bls12-377/g1_test.go +++ b/ecc/bls12-377/g1_test.go @@ -502,7 +502,7 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG1Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G1Affine - var RR [MAX_BATCH_SIZE]*G1Affine + var RR, PP [MAX_BATCH_SIZE]*G1Affine var ridx [MAX_BATCH_SIZE]int fillBenchBasesG1(P[:]) @@ -517,11 +517,12 @@ func BenchmarkBatchAddG1Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG1Affine(RR[:], P[:]) + batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) } } diff --git
a/ecc/bls12-377/g2.go b/ecc/bls12-377/g2.go index dd09808a13..04c0f5fac5 100644 --- a/ecc/bls12-377/g2.go +++ b/ecc/bls12-377/g2.go @@ -976,25 +976,29 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contain references to the points to ADD +// R[N-cptSub:], P[N-cptSub:] contain references to the points to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fptower.E2 - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -1003,17 +1007,11 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { var d fptower.E2 var rr G2Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // compute lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1024,6 +1022,27 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // compute lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bls12-377/g2_test.go b/ecc/bls12-377/g2_test.go index 52e3ff41c1..9048e32439 100644 --- a/ecc/bls12-377/g2_test.go +++ b/ecc/bls12-377/g2_test.go @@ -508,7 +508,7 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG2Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G2Affine - var RR [MAX_BATCH_SIZE]*G2Affine + var RR, PP [MAX_BATCH_SIZE]*G2Affine var ridx [MAX_BATCH_SIZE]int fillBenchBasesG2(P[:]) @@ -523,11 +523,12 @@ func BenchmarkBatchAddG2Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG2Affine(RR[:], P[:]) + batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) } } diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index 33cbf3844b..c8b2686337 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -55,9 +55,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, batchSize = 1 } var bucketIds BS // bitSet to signify presence
of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -65,17 +66,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP]) + batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -95,28 +98,40 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . + BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp @@ -268,9 +283,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, batchSize = 1 } var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -278,17 +294,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP]) + batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -308,28 +326,40 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? 
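Two things are going on in these hunks. First, any bucket/point pair with equal X coordinates is filtered out before it reaches the batch: the batched formula computes lambda = (yP - yR)/(xP - xR), then x3 = lambda^2 - xR - xP and y3 = lambda*(xR - x3) - yR, so xP == xR would feed a zero into the shared inversion; those cases resolve immediately to infinity (P plus -P) or to an explicit doubling. Second, additions and subtractions now share one slice pair from opposite ends, so the batch add can fold the negation into its lambda computation (d.Neg then d.Sub above) instead of materializing -P. A sketch of that two-ended bookkeeping, with an ad hoc type, the bn254 import from the earlier sketches, and a fixed 64 standing in for MAX_BATCH_SIZE:

import "github.com/consensys/gnark-crypto/ecc/bn254"

// twoEndedBatch mirrors the bookkeeping in the add closure: bucket
// references and point pointers for additions grow from index 0, those for
// subtractions grow from the end, and the untouched middle is simply
// skipped when the batch is flushed before it is full.
type twoEndedBatch struct {
	R, P           [64]*bn254.G1Affine
	cptAdd, cptSub int
}

func (b *twoEndedBatch) push(bucket, point *bn254.G1Affine, isNeg bool) {
	if isNeg {
		b.cptSub++
		b.R[len(b.R)-b.cptSub] = bucket
		b.P[len(b.P)-b.cptSub] = point
	} else {
		b.R[b.cptAdd] = bucket
		b.P[b.cptAdd] = point
		b.cptAdd++
	}
}

Storing pointers on both sides (the P array is now []*G1Affine) also removes the per-point copy or negation the previous version paid on every enqueue.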
+ // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling + BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp diff --git a/ecc/bls12-378/g1.go b/ecc/bls12-378/g1.go index 8422e95efb..d36be41445 100644 --- a/ecc/bls12-378/g1.go +++ b/ecc/bls12-378/g1.go @@ -980,25 +980,29 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contain point references to ADD +// R[N-cptSub:], P[N-cptSub:] contain point references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -1007,17 +1011,11 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { var d fp.Element var rr G1Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // compute lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1028,6 +1026,27 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // compute lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bls12-378/g1_test.go b/ecc/bls12-378/g1_test.go index 3859bb2695..a3603c49cb 100644 --- a/ecc/bls12-378/g1_test.go +++ b/ecc/bls12-378/g1_test.go @@ -502,7 +502,7 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG1Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G1Affine - var RR [MAX_BATCH_SIZE]*G1Affine + var RR, PP
[MAX_BATCH_SIZE]*G1Affine var ridx [MAX_BATCH_SIZE]int fillBenchBasesG1(P[:]) @@ -517,11 +517,12 @@ func BenchmarkBatchAddG1Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG1Affine(RR[:], P[:]) + batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) } } diff --git a/ecc/bls12-378/g2.go b/ecc/bls12-378/g2.go index 9cca73e6b3..9803f61512 100644 --- a/ecc/bls12-378/g2.go +++ b/ecc/bls12-378/g2.go @@ -976,25 +976,29 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contains points references to ADD +// R[N-cptSub:], P[N-cptSub] contains points references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fptower.E2 - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -1003,17 +1007,11 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { var d fptower.E2 var rr G2Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // computa lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1024,6 +1022,27 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // computa lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bls12-378/g2_test.go b/ecc/bls12-378/g2_test.go index f81d14069b..ffe94dbc2e 100644 --- a/ecc/bls12-378/g2_test.go +++ b/ecc/bls12-378/g2_test.go @@ -508,7 +508,7 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG2Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G2Affine - var RR [MAX_BATCH_SIZE]*G2Affine + var RR, PP [MAX_BATCH_SIZE]*G2Affine var ridx [MAX_BATCH_SIZE]int fillBenchBasesG2(P[:]) @@ -523,11 +523,12 @@ func BenchmarkBatchAddG2Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG2Affine(RR[:], P[:]) + batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, 
MAX_BATCH_SIZE/2) } } diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index b631d13a72..b30717ffea 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -55,9 +55,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, batchSize = 1 } var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -65,17 +66,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP]) + batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -95,28 +98,40 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . 
+ BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp @@ -268,9 +283,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, batchSize = 1 } var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -278,17 +294,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP]) + batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -308,28 +326,40 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . 
+ BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp diff --git a/ecc/bls12-381/g1.go b/ecc/bls12-381/g1.go index bb37dacb65..b05d04acc8 100644 --- a/ecc/bls12-381/g1.go +++ b/ecc/bls12-381/g1.go @@ -980,25 +980,29 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contains points references to ADD +// R[N-cptSub:], P[N-cptSub] contains points references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -1007,17 +1011,11 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { var d fp.Element var rr G1Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // computa lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1028,6 +1026,27 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // computa lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bls12-381/g1_test.go b/ecc/bls12-381/g1_test.go index ee4ce9fb21..68a84cf073 100644 --- a/ecc/bls12-381/g1_test.go +++ b/ecc/bls12-381/g1_test.go @@ -502,7 +502,7 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG1Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G1Affine - var RR [MAX_BATCH_SIZE]*G1Affine + var RR, PP [MAX_BATCH_SIZE]*G1Affine var ridx [MAX_BATCH_SIZE]int fillBenchBasesG1(P[:]) @@ -517,11 +517,12 @@ func BenchmarkBatchAddG1Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG1Affine(RR[:], P[:]) + batchAddG1Affine(RR[:], PP[:], 
MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) } } diff --git a/ecc/bls12-381/g2.go b/ecc/bls12-381/g2.go index 86ce9db5b6..c69c7f0444 100644 --- a/ecc/bls12-381/g2.go +++ b/ecc/bls12-381/g2.go @@ -977,25 +977,29 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contains points references to ADD +// R[N-cptSub:], P[N-cptSub] contains points references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fptower.E2 - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -1004,17 +1008,11 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { var d fptower.E2 var rr G2Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // computa lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1025,6 +1023,27 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // computa lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bls12-381/g2_test.go b/ecc/bls12-381/g2_test.go index a243b65b01..129a541689 100644 --- a/ecc/bls12-381/g2_test.go +++ b/ecc/bls12-381/g2_test.go @@ -508,7 +508,7 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG2Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G2Affine - var RR [MAX_BATCH_SIZE]*G2Affine + var RR, PP [MAX_BATCH_SIZE]*G2Affine var ridx [MAX_BATCH_SIZE]int fillBenchBasesG2(P[:]) @@ -523,11 +523,12 @@ func BenchmarkBatchAddG2Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG2Affine(RR[:], P[:]) + batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) } } diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index bf65dc9aa1..a2a7eb8ffb 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -55,9 +55,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, batchSize = 
1 } var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -65,17 +66,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP]) + batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -95,28 +98,40 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . + BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp @@ -268,9 +283,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, batchSize = 1 } var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -278,17 +294,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP]) + batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -308,28 +326,40 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? 
+ // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . + BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp diff --git a/ecc/bls24-315/g1.go b/ecc/bls24-315/g1.go index e55d4ad4cb..86e394a710 100644 --- a/ecc/bls24-315/g1.go +++ b/ecc/bls24-315/g1.go @@ -982,25 +982,29 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contains points references to ADD +// R[N-cptSub:], P[N-cptSub] contains points references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -1009,17 +1013,11 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { var d fp.Element var rr G1Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // computa lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1030,6 +1028,27 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // computa lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bls24-315/g1_test.go b/ecc/bls24-315/g1_test.go index d1061a803e..d3840c2537 100644 --- a/ecc/bls24-315/g1_test.go +++ b/ecc/bls24-315/g1_test.go @@ -502,7 +502,7 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG1Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G1Affine - var RR [MAX_BATCH_SIZE]*G1Affine + var RR, PP 
[MAX_BATCH_SIZE]*G1Affine var ridx [MAX_BATCH_SIZE]int fillBenchBasesG1(P[:]) @@ -517,11 +517,12 @@ func BenchmarkBatchAddG1Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG1Affine(RR[:], P[:]) + batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) } } diff --git a/ecc/bls24-315/g2.go b/ecc/bls24-315/g2.go index f5dffd0752..6170c188b6 100644 --- a/ecc/bls24-315/g2.go +++ b/ecc/bls24-315/g2.go @@ -992,25 +992,29 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contains points references to ADD +// R[N-cptSub:], P[N-cptSub] contains points references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fptower.E4 - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -1019,17 +1023,11 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { var d fptower.E4 var rr G2Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // computa lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1040,6 +1038,27 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // computa lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bls24-315/g2_test.go b/ecc/bls24-315/g2_test.go index ccdac4012c..2e97c208c0 100644 --- a/ecc/bls24-315/g2_test.go +++ b/ecc/bls24-315/g2_test.go @@ -508,7 +508,7 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG2Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G2Affine - var RR [MAX_BATCH_SIZE]*G2Affine + var RR, PP [MAX_BATCH_SIZE]*G2Affine var ridx [MAX_BATCH_SIZE]int fillBenchBasesG2(P[:]) @@ -523,11 +523,12 @@ func BenchmarkBatchAddG2Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG2Affine(RR[:], P[:]) + batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, 
MAX_BATCH_SIZE/2) } } diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index 50a7c5613a..5db551830d 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -55,9 +55,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, batchSize = 1 } var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -65,17 +66,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP]) + batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -95,28 +98,40 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . 
+ BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp @@ -268,9 +283,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, batchSize = 1 } var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -278,17 +294,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP]) + batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -308,28 +326,40 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . 
+ BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp diff --git a/ecc/bls24-317/g1.go b/ecc/bls24-317/g1.go index 58bee14819..1d4e27c062 100644 --- a/ecc/bls24-317/g1.go +++ b/ecc/bls24-317/g1.go @@ -982,25 +982,29 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contains points references to ADD +// R[N-cptSub:], P[N-cptSub] contains points references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -1009,17 +1013,11 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { var d fp.Element var rr G1Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // computa lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1030,6 +1028,27 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // computa lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bls24-317/g1_test.go b/ecc/bls24-317/g1_test.go index 3673290566..59fd0c425f 100644 --- a/ecc/bls24-317/g1_test.go +++ b/ecc/bls24-317/g1_test.go @@ -502,7 +502,7 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG1Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G1Affine - var RR [MAX_BATCH_SIZE]*G1Affine + var RR, PP [MAX_BATCH_SIZE]*G1Affine var ridx [MAX_BATCH_SIZE]int fillBenchBasesG1(P[:]) @@ -517,11 +517,12 @@ func BenchmarkBatchAddG1Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG1Affine(RR[:], P[:]) + batchAddG1Affine(RR[:], PP[:], 
MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) } } diff --git a/ecc/bls24-317/g2.go b/ecc/bls24-317/g2.go index f5fb993fb4..bbfcfd12b2 100644 --- a/ecc/bls24-317/g2.go +++ b/ecc/bls24-317/g2.go @@ -992,25 +992,29 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contains points references to ADD +// R[N-cptSub:], P[N-cptSub] contains points references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fptower.E4 - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -1019,17 +1023,11 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { var d fptower.E4 var rr G2Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // computa lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1040,6 +1038,27 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // computa lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bls24-317/g2_test.go b/ecc/bls24-317/g2_test.go index 74c8576f89..f02c85f79d 100644 --- a/ecc/bls24-317/g2_test.go +++ b/ecc/bls24-317/g2_test.go @@ -508,7 +508,7 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG2Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G2Affine - var RR [MAX_BATCH_SIZE]*G2Affine + var RR, PP [MAX_BATCH_SIZE]*G2Affine var ridx [MAX_BATCH_SIZE]int fillBenchBasesG2(P[:]) @@ -523,11 +523,12 @@ func BenchmarkBatchAddG2Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG2Affine(RR[:], P[:]) + batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) } } diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index 82ec92b92f..b517f1ef32 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -55,9 +55,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, batchSize = 
1 } var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -65,17 +66,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP]) + batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -95,28 +98,40 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . + BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp @@ -268,9 +283,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, batchSize = 1 } var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -278,17 +294,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP]) + batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -308,28 +326,40 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? 
+ // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . + BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp diff --git a/ecc/bn254/g1.go b/ecc/bn254/g1.go index 80cec53604..6f9a4d4e59 100644 --- a/ecc/bn254/g1.go +++ b/ecc/bn254/g1.go @@ -952,25 +952,29 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contains points references to ADD +// R[N-cptSub:], P[N-cptSub] contains points references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -979,17 +983,11 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { var d fp.Element var rr G1Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // computa lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1000,6 +998,27 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // computa lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bn254/g1_test.go b/ecc/bn254/g1_test.go index c87502be96..ffc6160b01 100644 --- a/ecc/bn254/g1_test.go +++ b/ecc/bn254/g1_test.go @@ -463,7 +463,7 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG1Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G1Affine - var RR [MAX_BATCH_SIZE]*G1Affine + var RR, PP [MAX_BATCH_SIZE]*G1Affine var ridx 
[MAX_BATCH_SIZE]int fillBenchBasesG1(P[:]) @@ -478,11 +478,12 @@ func BenchmarkBatchAddG1Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG1Affine(RR[:], P[:]) + batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) } } diff --git a/ecc/bn254/g2.go b/ecc/bn254/g2.go index 79215583d5..762a6f944b 100644 --- a/ecc/bn254/g2.go +++ b/ecc/bn254/g2.go @@ -981,25 +981,29 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contains points references to ADD +// R[N-cptSub:], P[N-cptSub] contains points references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fptower.E2 - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -1008,17 +1012,11 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { var d fptower.E2 var rr G2Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // computa lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1029,6 +1027,27 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // computa lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bn254/g2_test.go b/ecc/bn254/g2_test.go index 83d34fee91..9d0b38a11a 100644 --- a/ecc/bn254/g2_test.go +++ b/ecc/bn254/g2_test.go @@ -507,7 +507,7 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG2Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G2Affine - var RR [MAX_BATCH_SIZE]*G2Affine + var RR, PP [MAX_BATCH_SIZE]*G2Affine var ridx [MAX_BATCH_SIZE]int fillBenchBasesG2(P[:]) @@ -522,11 +522,12 @@ func BenchmarkBatchAddG2Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG2Affine(RR[:], P[:]) + batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) } } diff --git a/ecc/bn254/multiexp_affine.go 
b/ecc/bn254/multiexp_affine.go index b750572b22..48b5e6e242 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -55,9 +55,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, batchSize = 1 } var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -65,17 +66,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP]) + batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -95,28 +98,40 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . 
+ BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp @@ -268,9 +283,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, batchSize = 1 } var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -278,17 +294,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP]) + batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -308,28 +326,40 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . 
+ BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp diff --git a/ecc/bw6-633/g1.go b/ecc/bw6-633/g1.go index 41a18cf2af..9e61e67732 100644 --- a/ecc/bw6-633/g1.go +++ b/ecc/bw6-633/g1.go @@ -1084,25 +1084,29 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contains points references to ADD +// R[N-cptSub:], P[N-cptSub] contains points references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -1111,17 +1115,11 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { var d fp.Element var rr G1Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // computa lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1132,6 +1130,27 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // computa lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bw6-633/g1_test.go b/ecc/bw6-633/g1_test.go index 827cee65dd..91e28e75e0 100644 --- a/ecc/bw6-633/g1_test.go +++ b/ecc/bw6-633/g1_test.go @@ -502,7 +502,7 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG1Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G1Affine - var RR [MAX_BATCH_SIZE]*G1Affine + var RR, PP [MAX_BATCH_SIZE]*G1Affine var ridx [MAX_BATCH_SIZE]int fillBenchBasesG1(P[:]) @@ -517,11 +517,12 @@ func BenchmarkBatchAddG1Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG1Affine(RR[:], P[:]) + batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, 
MAX_BATCH_SIZE/2) } } diff --git a/ecc/bw6-633/g2.go b/ecc/bw6-633/g2.go index de70170a12..6f021168b5 100644 --- a/ecc/bw6-633/g2.go +++ b/ecc/bw6-633/g2.go @@ -947,25 +947,29 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contains points references to ADD +// R[N-cptSub:], P[N-cptSub] contains points references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -974,17 +978,11 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { var d fp.Element var rr G2Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // computa lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -995,6 +993,27 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // computa lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bw6-633/g2_test.go b/ecc/bw6-633/g2_test.go index 82ddc5385b..a51ae94c50 100644 --- a/ecc/bw6-633/g2_test.go +++ b/ecc/bw6-633/g2_test.go @@ -489,7 +489,7 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG2Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G2Affine - var RR [MAX_BATCH_SIZE]*G2Affine + var RR, PP [MAX_BATCH_SIZE]*G2Affine var ridx [MAX_BATCH_SIZE]int fillBenchBasesG2(P[:]) @@ -504,11 +504,12 @@ func BenchmarkBatchAddG2Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG2Affine(RR[:], P[:]) + batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) } } diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index 12762830f9..74c9b3d4dc 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -55,9 +55,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, batchSize = 1 } var bucketIds BS // bitSet to signify 
presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -65,17 +66,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP]) + batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -95,28 +98,40 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . + BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp @@ -250,9 +265,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, batchSize = 1 } var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -260,17 +276,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP]) + batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -290,28 +308,40 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? 
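[editor's note] The enqueue logic this closure ends with (see the lines that follow) packs additions at the front of R/P and subtractions at the back, so one fixed-size batch and a single batch inversion serve both kinds of ops. A condensed sketch of just that two-ended packing, with an illustrative op type:

type batchedOp struct {
	bucketID uint32
	neg      bool
}

// pack places o into batch: adds fill [0, cptAdd) from the left, subs fill
// [len(batch)-cptSub, len(batch)) from the right. The batch is full once
// cptAdd + cptSub == len(batch).
func pack(batch []batchedOp, cptAdd, cptSub *int, o batchedOp) {
	if o.neg {
		*cptSub++
		batch[len(batch)-*cptSub] = o
	} else {
		batch[*cptAdd] = o
		*cptAdd++
	}
}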
+ // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . + BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp diff --git a/ecc/bw6-756/g1.go b/ecc/bw6-756/g1.go index e1c7e9056a..fee3c6884b 100644 --- a/ecc/bw6-756/g1.go +++ b/ecc/bw6-756/g1.go @@ -1084,25 +1084,29 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contains points references to ADD +// R[N-cptSub:], P[N-cptSub] contains points references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -1111,17 +1115,11 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { var d fp.Element var rr G1Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // computa lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1132,6 +1130,27 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // computa lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bw6-756/g1_test.go b/ecc/bw6-756/g1_test.go index fc64f7646c..cfc93383c9 100644 --- a/ecc/bw6-756/g1_test.go +++ b/ecc/bw6-756/g1_test.go @@ -502,7 +502,7 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG1Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G1Affine - var RR [MAX_BATCH_SIZE]*G1Affine + var RR, PP [MAX_BATCH_SIZE]*G1Affine var 
ridx [MAX_BATCH_SIZE]int fillBenchBasesG1(P[:]) @@ -517,11 +517,12 @@ func BenchmarkBatchAddG1Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG1Affine(RR[:], P[:]) + batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) } } diff --git a/ecc/bw6-756/g2.go b/ecc/bw6-756/g2.go index 5302819c4b..195322273e 100644 --- a/ecc/bw6-756/g2.go +++ b/ecc/bw6-756/g2.go @@ -941,25 +941,29 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contains points references to ADD +// R[N-cptSub:], P[N-cptSub] contains points references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -968,17 +972,11 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { var d fp.Element var rr G2Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // computa lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -989,6 +987,27 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // computa lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bw6-756/g2_test.go b/ecc/bw6-756/g2_test.go index 065dc4432e..699df087ed 100644 --- a/ecc/bw6-756/g2_test.go +++ b/ecc/bw6-756/g2_test.go @@ -489,7 +489,7 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG2Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G2Affine - var RR [MAX_BATCH_SIZE]*G2Affine + var RR, PP [MAX_BATCH_SIZE]*G2Affine var ridx [MAX_BATCH_SIZE]int fillBenchBasesG2(P[:]) @@ -504,11 +504,12 @@ func BenchmarkBatchAddG2Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG2Affine(RR[:], P[:]) + batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) } } diff --git 
a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index e4748e2c8b..02bc11523c 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -55,9 +55,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, batchSize = 1 } var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -65,17 +66,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP]) + batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -95,28 +98,40 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . 
+ BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp @@ -250,9 +265,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, batchSize = 1 } var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -260,17 +276,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP]) + batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -290,28 +308,40 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . 
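[editor's note] In all of these chunk processors the doubling case falls back to BK.Add(BK, BK) outside the batch. The reason is mechanical: the batched chord slope is lambda = (yP - yR)/(xP - xR), and all denominators are inverted together with a single field inversion, so a zero denominator (xP == xR) would zero the running product and corrupt every inverse in the batch. A sketch of the guard, over math/big purely for illustration:

import "math/big"

// canBatch reports whether R + P may go through the batched chord formula:
// the denominator xP - xR must be invertible, i.e. nonzero.
func canBatch(xR, xP *big.Int) bool {
	return xR.Cmp(xP) != 0
}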
+ BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp diff --git a/ecc/bw6-761/g1.go b/ecc/bw6-761/g1.go index 86b99ebd1a..3537495b46 100644 --- a/ecc/bw6-761/g1.go +++ b/ecc/bw6-761/g1.go @@ -1095,25 +1095,29 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contains points references to ADD +// R[N-cptSub:], P[N-cptSub] contains points references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -1122,17 +1126,11 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { var d fp.Element var rr G1Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // computa lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1143,6 +1141,27 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // computa lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bw6-761/g1_test.go b/ecc/bw6-761/g1_test.go index 3be460742f..5b1b389102 100644 --- a/ecc/bw6-761/g1_test.go +++ b/ecc/bw6-761/g1_test.go @@ -502,7 +502,7 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG1Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G1Affine - var RR [MAX_BATCH_SIZE]*G1Affine + var RR, PP [MAX_BATCH_SIZE]*G1Affine var ridx [MAX_BATCH_SIZE]int fillBenchBasesG1(P[:]) @@ -517,11 +517,12 @@ func BenchmarkBatchAddG1Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG1Affine(RR[:], P[:]) + batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, 
MAX_BATCH_SIZE/2) } } diff --git a/ecc/bw6-761/g2.go b/ecc/bw6-761/g2.go index 77c4e1d375..41cfea623f 100644 --- a/ecc/bw6-761/g2.go +++ b/ecc/bw6-761/g2.go @@ -955,25 +955,29 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contains points references to ADD +// R[N-cptSub:], P[N-cptSub] contains points references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -982,17 +986,11 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { var d fp.Element var rr G2Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // computa lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1003,6 +1001,27 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // computa lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bw6-761/g2_test.go b/ecc/bw6-761/g2_test.go index 0268875661..76d8b7f7de 100644 --- a/ecc/bw6-761/g2_test.go +++ b/ecc/bw6-761/g2_test.go @@ -489,7 +489,7 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG2Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G2Affine - var RR [MAX_BATCH_SIZE]*G2Affine + var RR, PP [MAX_BATCH_SIZE]*G2Affine var ridx [MAX_BATCH_SIZE]int fillBenchBasesG2(P[:]) @@ -504,11 +504,12 @@ func BenchmarkBatchAddG2Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG2Affine(RR[:], P[:]) + batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) } } diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index 167bdf2902..4c9c97691d 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -55,9 +55,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, batchSize = 1 } var bucketIds BS // bitSet to signify 
presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -65,17 +66,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP]) + batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -95,28 +98,40 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . + BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp @@ -250,9 +265,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, batchSize = 1 } var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -260,17 +276,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP]) + batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -290,28 +308,40 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? 
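[editor's note] Every batchAdd body in this patch evaluates the same affine chord formulas. For R = (x1, y1) and P = (x2, y2) with x1 != x2:

    lambda = (y2 - y1) / (x2 - x1)
    x3     = lambda^2 - x1 - x2
    y3     = lambda*(x1 - x3) - y1

Subtraction needs no separate point: -P = (x2, -y2) shares P's X coordinate, so the sub loops reuse the already-inverted denominator and only negate the numerator, lambda = (-y2 - y1)/(x2 - x1), which is exactly the d.Neg(&P[idx].Y); d.Sub(&d, &R[idx].Y) sequence above. That shared denominator is what lets both halves ride on one batch inversion.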
+ // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . + BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index 16071451e5..8902aeb919 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -56,9 +56,10 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](c batchSize = 1 } var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]{{ $.TAffine }} // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*{{ $.TAffine }} // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*{{ $.TAffine }} // bucket references canAdd := func(bID uint32) bool { @@ -66,17 +67,19 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](c } isFull := func() bool { - return cptP == batchSize + return (cptAdd+cptSub) == batchSize } executeAndReset := func () { - if cptP == 0 { + if (cptAdd+cptSub) == 0 { return } - BatchAdd{{ $.TAffine }}(R[:cptP], P[:cptP]) + batchAdd{{ $.TAffine }}(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -96,28 +99,41 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](c } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . 
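[editor's note] The generated processChunk*BatchAffine functions and this template all share one scheduling pattern: a bucket may appear at most once per batch (the bucketIds bitset), conflicting ops wait in a queue, and the batch is flushed when full. A simplified, self-contained model of that loop; queue replay after a flush is elided here, the real code drains it opportunistically:

type chunkOp struct{ bucketID uint32 }

func schedule(ops []chunkOp, batchSize int, execute func([]chunkOp)) {
	inBatch := make(map[uint32]bool) // stand-in for the BS bitset
	batch := make([]chunkOp, 0, batchSize)
	var queue []chunkOp

	flush := func() {
		if len(batch) == 0 {
			return
		}
		execute(batch)
		batch = batch[:0]
		inBatch = make(map[uint32]bool)
	}

	for _, o := range ops {
		if inBatch[o.bucketID] {
			queue = append(queue, o) // conflict: bucket already in this batch
			continue
		}
		inBatch[o.bucketID] = true
		batch = append(batch, o)
		if len(batch) == batchSize {
			flush()
		}
	}
	// queued ops would be replayed here over further batches
	flush()
}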
+ BK.Add(BK, BK) return } + BK.setInfinity() + return } + - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize - cptSub] = BK + P[batchSize - cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } diff --git a/internal/generator/ecc/template/point.go.tmpl b/internal/generator/ecc/template/point.go.tmpl index c88ca29b88..d1107bb1c2 100644 --- a/internal/generator/ecc/template/point.go.tmpl +++ b/internal/generator/ecc/template/point.go.tmpl @@ -1571,26 +1571,30 @@ func BatchScalarMultiplication{{ toUpper .PointName }}(base *{{ $TAffine }}, sca -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAdd{{ $TAffine }}(R []*{{ $TAffine }}, P []{{ $TAffine }}) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contains points references to ADD +// R[N-cptSub:], P[N-cptSub] contains points references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAdd{{ $TAffine }}(R,P []*{{ $TAffine }}, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]{{.CoordType}} - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i:=len(R) - cptSub ;i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -1599,17 +1603,11 @@ func BatchAdd{{ $TAffine }}(R []*{{ $TAffine }}, P []{{ $TAffine }}) { var d {{.CoordType}} var rr {{ $TAffine }} - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // computa lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1620,6 +1618,27 @@ func BatchAdd{{ $TAffine }}(R []*{{ $TAffine }}, P []{{ $TAffine }}) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // computa lambda + idx := j+offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } diff --git a/internal/generator/ecc/template/tests/point.go.tmpl b/internal/generator/ecc/template/tests/point.go.tmpl index e033f96dd7..ee54b2dd1b 100644 --- a/internal/generator/ecc/template/tests/point.go.tmpl +++ b/internal/generator/ecc/template/tests/point.go.tmpl @@ -562,7 +562,7 @@ func Benchmark{{ $TJacobian }}IsInSubGroup(b *testing.B) { func BenchmarkBatchAdd{{ $TAffine }}(b *testing.B) { var P, R [MAX_BATCH_SIZE]{{ $TAffine }} - var RR [MAX_BATCH_SIZE]*{{ $TAffine }} + var RR, PP [MAX_BATCH_SIZE]*{{ $TAffine }} var ridx [MAX_BATCH_SIZE]int 
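[editor's note] The benchmark template that begins just above (and continues below) permutes ridx before wiring up the RR/PP pointer arrays. The shuffle is presumably deliberate: it makes each batched add chase pointers in random order, modeling the scattered bucket access of a real MSM rather than a cache-friendly linear sweep. The idiom in isolation:

import "math/rand"

func shuffledIndices(n int) []int {
	ridx := make([]int, n)
	for i := range ridx {
		ridx[i] = i
	}
	rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] })
	return ridx
}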
fillBenchBases{{ toUpper $.PointName }}(P[:]) @@ -577,11 +577,12 @@ func BenchmarkBatchAdd{{ $TAffine }}(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAdd{{ $TAffine }}(RR[:], P[:]) + batchAdd{{ $TAffine }}(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) } } From decae893557b67a921146a1fe4f1154d90412f8a Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Mon, 14 Nov 2022 12:03:27 -0600 Subject: [PATCH 17/43] feat: revert part of previous commit --- ecc/bls12-377/g1.go | 35 ++-------------- ecc/bls12-377/g1_test.go | 41 +++++++++--------- ecc/bls12-377/g2.go | 35 ++-------------- ecc/bls12-377/g2_test.go | 41 +++++++++--------- ecc/bls12-377/multiexp_affine.go | 42 +++++++------------ ecc/bls12-378/g1.go | 35 ++-------------- ecc/bls12-378/g1_test.go | 41 +++++++++--------- ecc/bls12-378/g2.go | 35 ++-------------- ecc/bls12-378/g2_test.go | 41 +++++++++--------- ecc/bls12-378/multiexp_affine.go | 42 +++++++------------ ecc/bls12-381/g1.go | 35 ++-------------- ecc/bls12-381/g1_test.go | 41 +++++++++--------- ecc/bls12-381/g2.go | 35 ++-------------- ecc/bls12-381/g2_test.go | 41 +++++++++--------- ecc/bls12-381/multiexp_affine.go | 42 +++++++------------ ecc/bls24-315/g1.go | 35 ++-------------- ecc/bls24-315/g1_test.go | 41 +++++++++--------- ecc/bls24-315/g2.go | 35 ++-------------- ecc/bls24-315/g2_test.go | 41 +++++++++--------- ecc/bls24-315/multiexp_affine.go | 42 +++++++------------ ecc/bls24-317/g1.go | 35 ++-------------- ecc/bls24-317/g1_test.go | 41 +++++++++--------- ecc/bls24-317/g2.go | 35 ++-------------- ecc/bls24-317/g2_test.go | 41 +++++++++--------- ecc/bls24-317/multiexp_affine.go | 42 +++++++------------ ecc/bn254/g1.go | 35 ++-------------- ecc/bn254/g1_test.go | 41 +++++++++--------- ecc/bn254/g2.go | 35 ++-------------- ecc/bn254/g2_test.go | 41 +++++++++--------- ecc/bn254/multiexp_affine.go | 42 +++++++------------ ecc/bw6-633/g1.go | 35 ++-------------- ecc/bw6-633/g1_test.go | 41 +++++++++--------- ecc/bw6-633/g2.go | 35 ++-------------- ecc/bw6-633/g2_test.go | 41 +++++++++--------- ecc/bw6-633/multiexp_affine.go | 42 +++++++------------ ecc/bw6-756/g1.go | 35 ++-------------- ecc/bw6-756/g1_test.go | 41 +++++++++--------- ecc/bw6-756/g2.go | 35 ++-------------- ecc/bw6-756/g2_test.go | 41 +++++++++--------- ecc/bw6-756/multiexp_affine.go | 42 +++++++------------ ecc/bw6-761/g1.go | 35 ++-------------- ecc/bw6-761/g1_test.go | 41 +++++++++--------- ecc/bw6-761/g2.go | 35 ++-------------- ecc/bw6-761/g2_test.go | 41 +++++++++--------- ecc/bw6-761/multiexp_affine.go | 42 +++++++------------ .../ecc/template/multiexp_affine.go.tmpl | 21 ++++------ internal/generator/ecc/template/point.go.tmpl | 35 ++-------------- .../ecc/template/tests/point.go.tmpl | 41 +++++++++--------- 48 files changed, 608 insertions(+), 1235 deletions(-) diff --git a/ecc/bls12-377/g1.go b/ecc/bls12-377/g1.go index 3602b14992..3be98b91a4 100644 --- a/ecc/bls12-377/g1.go +++ b/ecc/bls12-377/g1.go @@ -987,23 +987,17 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain 
[MAX_BATCH_SIZE]fp.Element - j := 0 // add part - for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -1012,7 +1006,7 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { var rr G1Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -1026,27 +1020,6 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bls12-377/g1_test.go b/ecc/bls12-377/g1_test.go index 105a2d0a9a..eb09d3cca4 100644 --- a/ecc/bls12-377/g1_test.go +++ b/ecc/bls12-377/g1_test.go @@ -19,7 +19,6 @@ package bls12377 import ( "fmt" "math/big" - "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-377/fp" @@ -500,32 +499,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG1Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G1Affine - var RR, PP [MAX_BATCH_SIZE]*G1Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG1Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G1Affine +// var RR, PP [MAX_BATCH_SIZE]*G1Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG1(P[:]) - fillBenchBasesG1(R[:]) +// fillBenchBasesG1(P[:]) +// fillBenchBasesG1(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls12-377/g2.go b/ecc/bls12-377/g2.go index 04c0f5fac5..4b6f3de628 100644 --- a/ecc/bls12-377/g2.go +++ b/ecc/bls12-377/g2.go @@ -983,23 +983,17 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]fptower.E2 - j := 0 // add part - for j 
= 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -1008,7 +1002,7 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { var rr G2Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -1022,27 +1016,6 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bls12-377/g2_test.go b/ecc/bls12-377/g2_test.go index 9048e32439..c0653c32af 100644 --- a/ecc/bls12-377/g2_test.go +++ b/ecc/bls12-377/g2_test.go @@ -19,7 +19,6 @@ package bls12377 import ( "fmt" "math/big" - "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-377/internal/fptower" @@ -506,32 +505,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG2Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G2Affine - var RR, PP [MAX_BATCH_SIZE]*G2Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG2Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G2Affine +// var RR, PP [MAX_BATCH_SIZE]*G2Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG2(P[:]) - fillBenchBasesG2(R[:]) +// fillBenchBasesG2(P[:]) +// fillBenchBasesG2(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index c8b2686337..41c16a3afe 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -56,9 +56,8 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -66,19 +65,18 @@ 
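[editor's note] The batchInvertG1Affine / batchInvertG2Affine helpers called throughout these diffs implement the standard Montgomery batch-inversion trick: one field inversion plus roughly three multiplications per element. A self-contained sketch over math/big (illustrative only; the library versions operate on fp.Element and fptower.E2):

import "math/big"

// batchInvert sets res[i] = a[i]^-1 mod p using a single modular inversion.
// Zero inputs are not handled; as in the code above, callers must filter
// zero denominators before batching.
func batchInvert(res, a []*big.Int, p *big.Int) {
	acc := big.NewInt(1)
	for i := range a {
		res[i] = new(big.Int).Set(acc) // prefix product a[0]*...*a[i-1]
		acc.Mul(acc, a[i]).Mod(acc, p)
	}
	acc.ModInverse(acc, p) // (a[0]*...*a[n-1])^-1
	for i := len(a) - 1; i >= 0; i-- {
		res[i].Mul(res[i], acc).Mod(res[i], p) // now a[i]^-1
		acc.Mul(acc, a[i]).Mod(acc, p)         // strip a[i] from the inverse
	}
}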
func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG1Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -122,16 +120,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp @@ -284,9 +279,8 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -294,19 +288,18 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG2Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -350,16 +343,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp diff --git a/ecc/bls12-378/g1.go b/ecc/bls12-378/g1.go index d36be41445..1545108a66 100644 --- a/ecc/bls12-378/g1.go +++ b/ecc/bls12-378/g1.go @@ -987,23 +987,17 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - j := 0 // add part - for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -1012,7 +1006,7 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { var rr G1Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -1026,27 +1020,6 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) 
R[j].Set(&rr) } - - // middle of the input may be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bls12-378/g1_test.go b/ecc/bls12-378/g1_test.go index a3603c49cb..6752818d29 100644 --- a/ecc/bls12-378/g1_test.go +++ b/ecc/bls12-378/g1_test.go @@ -19,7 +19,6 @@ package bls12378 import ( "fmt" "math/big" - "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-378/fp" @@ -500,32 +499,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG1Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G1Affine - var RR, PP [MAX_BATCH_SIZE]*G1Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG1Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G1Affine +// var RR, PP [MAX_BATCH_SIZE]*G1Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG1(P[:]) - fillBenchBasesG1(R[:]) +// fillBenchBasesG1(P[:]) +// fillBenchBasesG1(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls12-378/g2.go b/ecc/bls12-378/g2.go index 9803f61512..26aaa42624 100644 --- a/ecc/bls12-378/g2.go +++ b/ecc/bls12-378/g2.go @@ -983,23 +983,17 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]fptower.E2 - j := 0 // add part - for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -1008,7 +1002,7 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { var rr G2Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -1022,27 +1016,6 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may be 
ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bls12-378/g2_test.go b/ecc/bls12-378/g2_test.go index ffe94dbc2e..a9632dc413 100644 --- a/ecc/bls12-378/g2_test.go +++ b/ecc/bls12-378/g2_test.go @@ -19,7 +19,6 @@ package bls12378 import ( "fmt" "math/big" - "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-378/internal/fptower" @@ -506,32 +505,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG2Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G2Affine - var RR, PP [MAX_BATCH_SIZE]*G2Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG2Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G2Affine +// var RR, PP [MAX_BATCH_SIZE]*G2Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG2(P[:]) - fillBenchBasesG2(R[:]) +// fillBenchBasesG2(P[:]) +// fillBenchBasesG2(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index b30717ffea..95eb76b3ac 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -56,9 +56,8 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -66,19 +65,18 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG1Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -122,16 +120,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { 
- R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp @@ -284,9 +279,8 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -294,19 +288,18 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG2Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -350,16 +343,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp diff --git a/ecc/bls12-381/g1.go b/ecc/bls12-381/g1.go index b05d04acc8..5a59011791 100644 --- a/ecc/bls12-381/g1.go +++ b/ecc/bls12-381/g1.go @@ -987,23 +987,17 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - j := 0 // add part - for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -1012,7 +1006,7 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { var rr G1Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -1026,27 +1020,6 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bls12-381/g1_test.go b/ecc/bls12-381/g1_test.go index 68a84cf073..223c3763c0 100644 --- a/ecc/bls12-381/g1_test.go +++ b/ecc/bls12-381/g1_test.go @@ -19,7 +19,6 @@ 
package bls12381 import ( "fmt" "math/big" - "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-381/fp" @@ -500,32 +499,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG1Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G1Affine - var RR, PP [MAX_BATCH_SIZE]*G1Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG1Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G1Affine +// var RR, PP [MAX_BATCH_SIZE]*G1Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG1(P[:]) - fillBenchBasesG1(R[:]) +// fillBenchBasesG1(P[:]) +// fillBenchBasesG1(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls12-381/g2.go b/ecc/bls12-381/g2.go index c69c7f0444..6b7dfa5639 100644 --- a/ecc/bls12-381/g2.go +++ b/ecc/bls12-381/g2.go @@ -984,23 +984,17 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]fptower.E2 - j := 0 // add part - for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -1009,7 +1003,7 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { var rr G2Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -1023,27 +1017,6 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bls12-381/g2_test.go b/ecc/bls12-381/g2_test.go index 129a541689..be4957738e 100644 --- a/ecc/bls12-381/g2_test.go +++ b/ecc/bls12-381/g2_test.go @@ -19,7 +19,6 @@ package bls12381 import ( "fmt" "math/big" - "math/rand" 
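// ("math/rand" is dropped here because its only user in this file, the
// rand.Shuffle call in BenchmarkBatchAddG2Affine, is commented out below;
// Go treats an unused import as a compile error, so the import must go too.)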
"testing" "github.com/consensys/gnark-crypto/ecc/bls12-381/internal/fptower" @@ -506,32 +505,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG2Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G2Affine - var RR, PP [MAX_BATCH_SIZE]*G2Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG2Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G2Affine +// var RR, PP [MAX_BATCH_SIZE]*G2Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG2(P[:]) - fillBenchBasesG2(R[:]) +// fillBenchBasesG2(P[:]) +// fillBenchBasesG2(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index a2a7eb8ffb..5a51ee46b6 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -56,9 +56,8 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -66,19 +65,18 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG1Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -122,16 +120,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp @@ -284,9 +279,8 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -294,19 +288,18 @@ func 
processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG2Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -350,16 +343,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp diff --git a/ecc/bls24-315/g1.go b/ecc/bls24-315/g1.go index 86e394a710..cd0d0a8a69 100644 --- a/ecc/bls24-315/g1.go +++ b/ecc/bls24-315/g1.go @@ -989,23 +989,17 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - j := 0 // add part - for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -1014,7 +1008,7 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { var rr G1Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -1028,27 +1022,6 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bls24-315/g1_test.go b/ecc/bls24-315/g1_test.go index d3840c2537..4ffe3679c7 100644 --- a/ecc/bls24-315/g1_test.go +++ b/ecc/bls24-315/g1_test.go @@ -19,7 +19,6 @@ package bls24315 import ( "fmt" "math/big" - "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls24-315/fp" @@ -500,32 +499,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG1Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G1Affine - var RR, PP [MAX_BATCH_SIZE]*G1Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG1Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G1Affine +// var RR, PP [MAX_BATCH_SIZE]*G1Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG1(P[:]) - fillBenchBasesG1(R[:]) +// fillBenchBasesG1(P[:]) +// fillBenchBasesG1(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < 
len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls24-315/g2.go b/ecc/bls24-315/g2.go index 6170c188b6..7fa2e026c3 100644 --- a/ecc/bls24-315/g2.go +++ b/ecc/bls24-315/g2.go @@ -999,23 +999,17 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]fptower.E4 - j := 0 // add part - for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -1024,7 +1018,7 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { var rr G2Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -1038,27 +1032,6 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bls24-315/g2_test.go b/ecc/bls24-315/g2_test.go index 2e97c208c0..019fa5ec24 100644 --- a/ecc/bls24-315/g2_test.go +++ b/ecc/bls24-315/g2_test.go @@ -19,7 +19,6 @@ package bls24315 import ( "fmt" "math/big" - "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls24-315/internal/fptower" @@ -506,32 +505,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG2Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G2Affine - var RR, PP [MAX_BATCH_SIZE]*G2Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG2Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G2Affine +// var RR, PP [MAX_BATCH_SIZE]*G2Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG2(P[:]) - fillBenchBasesG2(R[:]) +// fillBenchBasesG2(P[:]) +// fillBenchBasesG2(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // 
random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index 5db551830d..3714629530 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -56,9 +56,8 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -66,19 +65,18 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG1Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -122,16 +120,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp @@ -284,9 +279,8 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -294,19 +288,18 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG2Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -350,16 +343,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] 
= BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp diff --git a/ecc/bls24-317/g1.go b/ecc/bls24-317/g1.go index 1d4e27c062..4a28082836 100644 --- a/ecc/bls24-317/g1.go +++ b/ecc/bls24-317/g1.go @@ -989,23 +989,17 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - j := 0 // add part - for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -1014,7 +1008,7 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { var rr G1Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -1028,27 +1022,6 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bls24-317/g1_test.go b/ecc/bls24-317/g1_test.go index 59fd0c425f..3a89f924e5 100644 --- a/ecc/bls24-317/g1_test.go +++ b/ecc/bls24-317/g1_test.go @@ -19,7 +19,6 @@ package bls24317 import ( "fmt" "math/big" - "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls24-317/fp" @@ -500,32 +499,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG1Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G1Affine - var RR, PP [MAX_BATCH_SIZE]*G1Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG1Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G1Affine +// var RR, PP [MAX_BATCH_SIZE]*G1Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG1(P[:]) - fillBenchBasesG1(R[:]) +// fillBenchBasesG1(P[:]) +// fillBenchBasesG1(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { 
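// The single-inversion trick that batchAddG1Affine and batchInvertG1Affine
// rely on can be seen in isolation. Below is a minimal, self-contained
// sketch of Montgomery batch inversion over math/big — illustrative only:
// the toy prime, the batchInvert name and the main harness are not part of
// this patch (the real code is the fixed-size routine over fp.Element).

package main

import (
	"fmt"
	"math/big"
)

// batchInvert returns the modular inverses of all a[i] at the cost of a
// single ModInverse: a forward pass stores prefix products, then one
// inversion of the total product is unwound in a backward pass.
func batchInvert(a []*big.Int, p *big.Int) []*big.Int {
	res := make([]*big.Int, len(a))
	acc := big.NewInt(1)
	for i := range a {
		res[i] = new(big.Int).Set(acc) // res[i] = a[0]*a[1]*...*a[i-1] mod p
		acc.Mul(acc, a[i]).Mod(acc, p)
	}
	acc.ModInverse(acc, p) // the only field inversion in the whole batch
	for i := len(a) - 1; i >= 0; i-- {
		res[i].Mul(res[i], acc).Mod(res[i], p) // now res[i] = 1/a[i] mod p
		acc.Mul(acc, a[i]).Mod(acc, p)         // strip a[i] from the accumulator
	}
	return res
}

func main() {
	p := big.NewInt(2147483647) // toy prime modulus (2^31 - 1), assumption for the demo
	a := []*big.Int{big.NewInt(3), big.NewInt(7), big.NewInt(11)}
	for i, inv := range batchInvert(a, p) {
		check := new(big.Int).Mul(a[i], inv)
		fmt.Println(check.Mod(check, p)) // prints 1 three times
	}
}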
	// ensure every word of the scalars is filled
diff --git a/ecc/bls24-317/g2.go b/ecc/bls24-317/g2.go
index bbfcfd12b2..acd1176cdd 100644
--- a/ecc/bls24-317/g2.go
+++ b/ecc/bls24-317/g2.go
@@ -999,23 +999,17 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin
 // R[:cptAdd], P[:cptAdd] contain point references to ADD
 // R[N-cptSub:], P[N-cptSub:] contain point references to SUB
 // cptAdd + cptSub == batchSize, and batchSize may be smaller than N
-func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) {
-	batchSize := cptAdd + cptSub
+func batchAddG2Affine(R []*G2Affine, P []G2Affine) {
+	batchSize := len(R)
 	if batchSize == 0 {
 		return
 	}
 	var lambda, lambdain [MAX_BATCH_SIZE]fptower.E4

-	j := 0
 	// add part
-	for j = 0; j < cptAdd; j++ {
+	for j := 0; j < batchSize; j++ {
 		lambdain[j].Sub(&P[j].X, &R[j].X)
 	}
-	// sub part
-	for i := len(R) - cptSub; i < len(R); i++ {
-		lambdain[j].Sub(&P[i].X, &R[i].X)
-		j++
-	}

 	// invert denominator
 	batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize])
@@ -1024,7 +1018,7 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) {
 	var rr G2Affine

 	// add part
-	for j := 0; j < cptAdd; j++ {
+	for j := 0; j < batchSize; j++ {
 		// compute lambda
 		d.Sub(&P[j].Y, &R[j].Y)
 		lambda[j].Mul(&lambda[j], &d)
@@ -1038,27 +1032,6 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) {
 		rr.Y.Sub(&rr.Y, &R[j].Y)
 		R[j].Set(&rr)
 	}
-
-	// middle of the input may be ignored if cptAdd + cptSub != len(R)
-	offset := len(R) - batchSize
-
-	// sub part
-	for j := cptAdd; j < batchSize; j++ {
-		// compute lambda
-		idx := j + offset
-		d.Neg(&P[idx].Y)
-		d.Sub(&d, &R[idx].Y)
-		lambda[j].Mul(&lambda[j], &d)
-
-		// compute X, Y
-		rr.X.Square(&lambda[j])
-		rr.X.Sub(&rr.X, &R[idx].X)
-		rr.X.Sub(&rr.X, &P[idx].X)
-		d.Sub(&R[idx].X, &rr.X)
-		rr.Y.Mul(&lambda[j], &d)
-		rr.Y.Sub(&rr.Y, &R[idx].Y)
-		R[idx].Set(&rr)
-	}
 }

 // batch inversion
diff --git a/ecc/bls24-317/g2_test.go b/ecc/bls24-317/g2_test.go
index f02c85f79d..1d7ed1f3ff 100644
--- a/ecc/bls24-317/g2_test.go
+++ b/ecc/bls24-317/g2_test.go
@@ -19,7 +19,6 @@ package bls24317
 import (
	"fmt"
	"math/big"
-	"math/rand"
	"testing"

	"github.com/consensys/gnark-crypto/ecc/bls24-317/internal/fptower"
@@ -506,32 +505,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) {
 }

-func BenchmarkBatchAddG2Affine(b *testing.B) {
-	var P, R [MAX_BATCH_SIZE]G2Affine
-	var RR, PP [MAX_BATCH_SIZE]*G2Affine
-	var ridx [MAX_BATCH_SIZE]int
+// func BenchmarkBatchAddG2Affine(b *testing.B) {
+// 	var P, R [MAX_BATCH_SIZE]G2Affine
+// 	var RR, PP [MAX_BATCH_SIZE]*G2Affine
+// 	var ridx [MAX_BATCH_SIZE]int

-	fillBenchBasesG2(P[:])
-	fillBenchBasesG2(R[:])
+// 	fillBenchBasesG2(P[:])
+// 	fillBenchBasesG2(R[:])

-	for i := 0; i < len(ridx); i++ {
-		ridx[i] = i
-	}
+// 	for i := 0; i < len(ridx); i++ {
+// 		ridx[i] = i
+// 	}

-	// random permute
-	rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] })
+// 	// random permute
+// 	rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] })

-	for i, ri := range ridx {
-		RR[i] = &R[ri]
-		PP[i] = &P[ri]
-	}
+// 	for i, ri := range ridx {
+// 		RR[i] = &R[ri]
+// 		PP[i] = &P[ri]
+// 	}

-	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
-		batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2)
-	}
+// 	b.ResetTimer()
+// 	for i := 0; i < b.N; i++ {
+// 		batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2)
+// 	}

-}
+// }

 func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) {
	// ensure every word of the scalars is
filled diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index b517f1ef32..789e3e18be 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -56,9 +56,8 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -66,19 +65,18 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG1Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -122,16 +120,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp @@ -284,9 +279,8 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -294,19 +288,18 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG2Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -350,16 +343,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp diff --git a/ecc/bn254/g1.go b/ecc/bn254/g1.go index 6f9a4d4e59..84e72c7c5f 100644 --- a/ecc/bn254/g1.go +++ b/ecc/bn254/g1.go @@ -959,23 +959,17 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if 
batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - j := 0 // add part - for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -984,7 +978,7 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { var rr G1Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -998,27 +992,6 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bn254/g1_test.go b/ecc/bn254/g1_test.go index ffc6160b01..2e1973a911 100644 --- a/ecc/bn254/g1_test.go +++ b/ecc/bn254/g1_test.go @@ -19,7 +19,6 @@ package bn254 import ( "fmt" "math/big" - "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bn254/fp" @@ -461,32 +460,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG1Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G1Affine - var RR, PP [MAX_BATCH_SIZE]*G1Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG1Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G1Affine +// var RR, PP [MAX_BATCH_SIZE]*G1Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG1(P[:]) - fillBenchBasesG1(R[:]) +// fillBenchBasesG1(P[:]) +// fillBenchBasesG1(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bn254/g2.go b/ecc/bn254/g2.go index 762a6f944b..2bef2cba4c 100644 --- a/ecc/bn254/g2.go +++ b/ecc/bn254/g2.go @@ -988,23 +988,17 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]fptower.E2 - j := 0 // add part - 
for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -1013,7 +1007,7 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { var rr G2Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -1027,27 +1021,6 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bn254/g2_test.go b/ecc/bn254/g2_test.go index 9d0b38a11a..ae107fea78 100644 --- a/ecc/bn254/g2_test.go +++ b/ecc/bn254/g2_test.go @@ -19,7 +19,6 @@ package bn254 import ( "fmt" "math/big" - "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bn254/internal/fptower" @@ -505,32 +504,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG2Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G2Affine - var RR, PP [MAX_BATCH_SIZE]*G2Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG2Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G2Affine +// var RR, PP [MAX_BATCH_SIZE]*G2Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG2(P[:]) - fillBenchBasesG2(R[:]) +// fillBenchBasesG2(P[:]) +// fillBenchBasesG2(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index 48b5e6e242..9880a1276b 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -56,9 +56,8 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -66,19 +65,18 @@ func processChunkG1BatchAffine[B 
ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG1Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -122,16 +120,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp @@ -284,9 +279,8 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -294,19 +288,18 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG2Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -350,16 +343,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp diff --git a/ecc/bw6-633/g1.go b/ecc/bw6-633/g1.go index 9e61e67732..d70a92aeec 100644 --- a/ecc/bw6-633/g1.go +++ b/ecc/bw6-633/g1.go @@ -1091,23 +1091,17 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - j := 0 // add part - for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -1116,7 +1110,7 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { var rr G1Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -1130,27 +1124,6 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may 
be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bw6-633/g1_test.go b/ecc/bw6-633/g1_test.go index 91e28e75e0..a2b6c273f6 100644 --- a/ecc/bw6-633/g1_test.go +++ b/ecc/bw6-633/g1_test.go @@ -19,7 +19,6 @@ package bw6633 import ( "fmt" "math/big" - "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-633/fp" @@ -500,32 +499,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG1Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G1Affine - var RR, PP [MAX_BATCH_SIZE]*G1Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG1Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G1Affine +// var RR, PP [MAX_BATCH_SIZE]*G1Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG1(P[:]) - fillBenchBasesG1(R[:]) +// fillBenchBasesG1(P[:]) +// fillBenchBasesG1(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bw6-633/g2.go b/ecc/bw6-633/g2.go index 6f021168b5..a84adbb320 100644 --- a/ecc/bw6-633/g2.go +++ b/ecc/bw6-633/g2.go @@ -954,23 +954,17 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - j := 0 // add part - for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -979,7 +973,7 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { var rr G2Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -993,27 +987,6 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // 
sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bw6-633/g2_test.go b/ecc/bw6-633/g2_test.go index a51ae94c50..f5c4d5edca 100644 --- a/ecc/bw6-633/g2_test.go +++ b/ecc/bw6-633/g2_test.go @@ -19,7 +19,6 @@ package bw6633 import ( "fmt" "math/big" - "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-633/fp" @@ -487,32 +486,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG2Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G2Affine - var RR, PP [MAX_BATCH_SIZE]*G2Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG2Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G2Affine +// var RR, PP [MAX_BATCH_SIZE]*G2Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG2(P[:]) - fillBenchBasesG2(R[:]) +// fillBenchBasesG2(P[:]) +// fillBenchBasesG2(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index 74c9b3d4dc..5b679cba91 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -56,9 +56,8 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -66,19 +65,18 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG1Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -122,16 +120,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue 
[MAX_BATCH_SIZE]batchOp @@ -266,9 +261,8 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -276,19 +270,18 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG2Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -332,16 +325,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp diff --git a/ecc/bw6-756/g1.go b/ecc/bw6-756/g1.go index fee3c6884b..57631a43e5 100644 --- a/ecc/bw6-756/g1.go +++ b/ecc/bw6-756/g1.go @@ -1091,23 +1091,17 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - j := 0 // add part - for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -1116,7 +1110,7 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { var rr G1Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -1130,27 +1124,6 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bw6-756/g1_test.go b/ecc/bw6-756/g1_test.go index cfc93383c9..bd7a65f693 100644 --- a/ecc/bw6-756/g1_test.go +++ b/ecc/bw6-756/g1_test.go @@ -19,7 +19,6 @@ package bw6756 import ( "fmt" "math/big" - "math/rand" "testing" 
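// The per-point work inside batchAddG1Affine is the plain affine chord
// rule. Here is a minimal sketch over math/big with hypothetical names
// (toy prime, affine struct, chordAdd — none of them from this patch);
// it only illustrates the formulas the loops above implement:

package main

import (
	"fmt"
	"math/big"
)

var p = big.NewInt(2147483647) // toy prime standing in for the real fp modulus

type affine struct{ x, y big.Int }

// chordAdd sets r = r + q, given lam = 1/(q.x - r.x) mod p; in the patch
// those denominators are exactly what goes through batch inversion, so
// each addition costs a few multiplications but no per-point inversion.
func chordAdd(r, q *affine, lam *big.Int) {
	var l, d, x, y big.Int
	d.Sub(&q.y, &r.y)
	l.Mul(&d, lam).Mod(&l, p) // l = (q.y - r.y)/(q.x - r.x), the chord slope
	x.Mul(&l, &l).Sub(&x, &r.x)
	x.Sub(&x, &q.x).Mod(&x, p) // x3 = l^2 - r.x - q.x
	d.Sub(&r.x, &x)
	y.Mul(&l, &d).Sub(&y, &r.y).Mod(&y, p) // y3 = l*(r.x - x3) - r.y
	r.x.Set(&x)
	r.y.Set(&y)
}

func main() {
	r := affine{*big.NewInt(5), *big.NewInt(9)} // placeholder coordinates,
	q := affine{*big.NewInt(2), *big.NewInt(7)} // not genuine curve points
	lam := new(big.Int).Sub(&q.x, &r.x)
	lam.ModInverse(lam.Mod(lam, p), p)
	chordAdd(&r, &q, lam)
	fmt.Println(&r.x, &r.y)
	// Subtracting q is the same call with (q.x, -q.y): that identity is
	// why P[cptAdd].Neg(PP) lets the new code drop the cptSub partition.
}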
"github.com/consensys/gnark-crypto/ecc/bw6-756/fp" @@ -500,32 +499,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG1Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G1Affine - var RR, PP [MAX_BATCH_SIZE]*G1Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG1Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G1Affine +// var RR, PP [MAX_BATCH_SIZE]*G1Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG1(P[:]) - fillBenchBasesG1(R[:]) +// fillBenchBasesG1(P[:]) +// fillBenchBasesG1(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bw6-756/g2.go b/ecc/bw6-756/g2.go index 195322273e..c2b10451c0 100644 --- a/ecc/bw6-756/g2.go +++ b/ecc/bw6-756/g2.go @@ -948,23 +948,17 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - j := 0 // add part - for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -973,7 +967,7 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { var rr G2Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -987,27 +981,6 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bw6-756/g2_test.go b/ecc/bw6-756/g2_test.go index 699df087ed..7d98c06668 100644 --- a/ecc/bw6-756/g2_test.go +++ b/ecc/bw6-756/g2_test.go @@ -19,7 +19,6 @@ package bw6756 import ( "fmt" "math/big" - "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-756/fp" @@ -487,32 +486,32 @@ func 
BenchmarkG2JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG2Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G2Affine - var RR, PP [MAX_BATCH_SIZE]*G2Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG2Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G2Affine +// var RR, PP [MAX_BATCH_SIZE]*G2Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG2(P[:]) - fillBenchBasesG2(R[:]) +// fillBenchBasesG2(P[:]) +// fillBenchBasesG2(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index 02bc11523c..fade05f3ff 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -56,9 +56,8 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -66,19 +65,18 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG1Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -122,16 +120,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp @@ -266,9 +261,8 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -276,19 +270,18 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == 
batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG2Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -332,16 +325,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp diff --git a/ecc/bw6-761/g1.go b/ecc/bw6-761/g1.go index 3537495b46..08f5e476d9 100644 --- a/ecc/bw6-761/g1.go +++ b/ecc/bw6-761/g1.go @@ -1102,23 +1102,17 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - j := 0 // add part - for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -1127,7 +1121,7 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { var rr G1Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -1141,27 +1135,6 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bw6-761/g1_test.go b/ecc/bw6-761/g1_test.go index 5b1b389102..4cbc725f60 100644 --- a/ecc/bw6-761/g1_test.go +++ b/ecc/bw6-761/g1_test.go @@ -19,7 +19,6 @@ package bw6761 import ( "fmt" "math/big" - "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-761/fp" @@ -500,32 +499,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG1Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G1Affine - var RR, PP [MAX_BATCH_SIZE]*G1Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG1Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G1Affine +// var RR, PP [MAX_BATCH_SIZE]*G1Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG1(P[:]) - fillBenchBasesG1(R[:]) +// fillBenchBasesG1(P[:]) +// fillBenchBasesG1(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // 
random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bw6-761/g2.go b/ecc/bw6-761/g2.go index 41cfea623f..48a7a69586 100644 --- a/ecc/bw6-761/g2.go +++ b/ecc/bw6-761/g2.go @@ -962,23 +962,17 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - j := 0 // add part - for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -987,7 +981,7 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { var rr G2Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -1001,27 +995,6 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bw6-761/g2_test.go b/ecc/bw6-761/g2_test.go index 76d8b7f7de..7fa415d6a5 100644 --- a/ecc/bw6-761/g2_test.go +++ b/ecc/bw6-761/g2_test.go @@ -19,7 +19,6 @@ package bw6761 import ( "fmt" "math/big" - "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-761/fp" @@ -487,32 +486,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG2Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G2Affine - var RR, PP [MAX_BATCH_SIZE]*G2Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG2Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G2Affine +// var RR, PP [MAX_BATCH_SIZE]*G2Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG2(P[:]) - fillBenchBasesG2(R[:]) +// fillBenchBasesG2(P[:]) +// fillBenchBasesG2(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = 
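
The batchInvert* call that feeds those lambdas uses Montgomery's trick: one forward pass of prefix products, a single modular inversion, then a backward pass that peels off each individual inverse, i.e. one I plus roughly 3(n-1) M for n inverses. A minimal sketch, again with math/big in place of fp.Element and assuming every input is nonzero mod p (the real helper likewise ignores edge cases):

package main

import (
	"fmt"
	"math/big"
)

// batchInvert computes the inverses of a[0..n-1] mod p with a single
// modular inversion plus about 3(n-1) multiplications.
// All inputs are assumed nonzero mod p.
func batchInvert(a []*big.Int, p *big.Int) []*big.Int {
	res := make([]*big.Int, len(a))
	acc := big.NewInt(1)
	// forward pass: res[i] holds the product a[0]*...*a[i-1]
	for i := range a {
		res[i] = new(big.Int).Set(acc)
		acc.Mul(acc, a[i]).Mod(acc, p)
	}
	// one inversion of the full product
	acc.ModInverse(acc, p)
	// backward pass: prefix product * inverse of remaining suffix = a[i]^-1
	for i := len(a) - 1; i >= 0; i-- {
		res[i].Mul(res[i], acc).Mod(res[i], p)
		acc.Mul(acc, a[i]).Mod(acc, p)
	}
	return res
}

func main() {
	p := big.NewInt(101)
	a := []*big.Int{big.NewInt(3), big.NewInt(7), big.NewInt(10)}
	for i, inv := range batchInvert(a, p) {
		fmt.Printf("%v^-1 mod %v = %v\n", a[i], p, inv) // 34, 29, 91
	}
}
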
ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index 4c9c97691d..325f500b60 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -56,9 +56,8 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -66,19 +65,18 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG1Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -122,16 +120,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp @@ -266,9 +261,8 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -276,19 +270,18 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG2Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -332,16 +325,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl 
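
Note the shape of the refactor repeated in the hunks above: instead of keeping additions at the front of the batch and subtractions at the back, a subtraction bucket - P is rewritten as bucket + (-P) by copying P with its Y coordinate negated (P[cptAdd].Neg(PP)), so a single loop and a single slice layout handle both cases. On a short Weierstrass curve, negating an affine point is a single field negation; a small sketch with an illustrative affine type:

package main

import (
	"fmt"
	"math/big"
)

type affine struct{ X, Y *big.Int }

// negAffine returns -pt: on a short Weierstrass curve the inverse of
// (x, y) is (x, p-y). This cheap step is what lets P[cptAdd].Neg(PP)
// turn every "subtract point from bucket" into a plain batched addition.
func negAffine(pt affine, p *big.Int) affine {
	negY := new(big.Int).Sub(p, pt.Y)
	negY.Mod(negY, p)
	return affine{new(big.Int).Set(pt.X), negY}
}

func main() {
	p := big.NewInt(101)
	q := negAffine(affine{big.NewInt(1), big.NewInt(2)}, p)
	fmt.Printf("-(1,2) = (%v,%v) mod 101\n", q.X, q.Y) // (1,99)
}
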
b/internal/generator/ecc/template/multiexp_affine.go.tmpl index 8902aeb919..044f5d3cc9 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -57,9 +57,8 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](c } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*{{ $.TAffine }} // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]{{ $.TAffine }} // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*{{ $.TAffine }} // bucket references canAdd := func(bID uint32) bool { @@ -67,19 +66,18 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](c } isFull := func() bool { - return (cptAdd+cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func () { - if (cptAdd+cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAdd{{ $.TAffine }}(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAdd{{ $.TAffine }}(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -124,16 +122,13 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](c bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize - cptSub] = BK - P[batchSize - cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } diff --git a/internal/generator/ecc/template/point.go.tmpl b/internal/generator/ecc/template/point.go.tmpl index d1107bb1c2..2a3b418cbb 100644 --- a/internal/generator/ecc/template/point.go.tmpl +++ b/internal/generator/ecc/template/point.go.tmpl @@ -1578,24 +1578,18 @@ func BatchScalarMultiplication{{ toUpper .PointName }}(base *{{ $TAffine }}, sca // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAdd{{ $TAffine }}(R,P []*{{ $TAffine }}, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAdd{{ $TAffine }}(R []*{{ $TAffine }},P []{{ $TAffine }}) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]{{.CoordType}} - j := 0 // add part - for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i:=len(R) - cptSub ;i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvert{{ $TAffine }}(lambda[:batchSize], lambdain[:batchSize]) @@ -1604,7 +1598,7 @@ func batchAdd{{ $TAffine }}(R,P []*{{ $TAffine }}, cptAdd, cptSub int) { var rr {{ $TAffine }} // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -1618,27 +1612,6 @@ func batchAdd{{ $TAffine }}(R,P []*{{ $TAffine }}, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j+offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - 
d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } diff --git a/internal/generator/ecc/template/tests/point.go.tmpl b/internal/generator/ecc/template/tests/point.go.tmpl index ee54b2dd1b..223bfbe040 100644 --- a/internal/generator/ecc/template/tests/point.go.tmpl +++ b/internal/generator/ecc/template/tests/point.go.tmpl @@ -16,7 +16,6 @@ import ( "fmt" "math/big" "testing" - "math/rand" {{if or (eq .CoordType "fptower.E2") (eq .CoordType "fptower.E4")}} "github.com/consensys/gnark-crypto/ecc/{{.Name}}/internal/fptower" @@ -560,32 +559,32 @@ func Benchmark{{ $TJacobian }}IsInSubGroup(b *testing.B) { } -func BenchmarkBatchAdd{{ $TAffine }}(b *testing.B) { - var P, R [MAX_BATCH_SIZE]{{ $TAffine }} - var RR, PP [MAX_BATCH_SIZE]*{{ $TAffine }} - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAdd{{ $TAffine }}(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]{{ $TAffine }} +// var RR, PP [MAX_BATCH_SIZE]*{{ $TAffine }} +// var ridx [MAX_BATCH_SIZE]int - fillBenchBases{{ toUpper $.PointName }}(P[:]) - fillBenchBases{{ toUpper $.PointName }}(R[:]) +// fillBenchBases{{ toUpper $.PointName }}(P[:]) +// fillBenchBases{{ toUpper $.PointName }}(R[:]) - for i:=0; i < len(ridx);i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAdd{{ $TAffine }}(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAdd{{ $TAffine }}(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func Benchmark{{ $TAffine }}BatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled From d60bf24592e54e114772825230c0d115cdbb1a4d Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Mon, 14 Nov 2022 12:19:30 -0600 Subject: [PATCH 18/43] feat: since we cap c==16 we may as well use uint16 --- ecc/bls12-377/g1.go | 8 ++--- ecc/bls12-377/g2.go | 8 ++--- ecc/bls12-377/multiexp.go | 18 +++++----- ecc/bls12-377/multiexp_affine.go | 35 +++++++++++-------- ecc/bls12-377/multiexp_jacobian.go | 4 +-- ecc/bls12-378/g1.go | 8 ++--- ecc/bls12-378/g2.go | 8 ++--- ecc/bls12-378/multiexp.go | 18 +++++----- ecc/bls12-378/multiexp_affine.go | 35 +++++++++++-------- ecc/bls12-378/multiexp_jacobian.go | 4 +-- ecc/bls12-381/g1.go | 8 ++--- ecc/bls12-381/g2.go | 8 ++--- ecc/bls12-381/multiexp.go | 18 +++++----- ecc/bls12-381/multiexp_affine.go | 35 +++++++++++-------- ecc/bls12-381/multiexp_jacobian.go | 4 +-- ecc/bls24-315/g1.go | 8 ++--- ecc/bls24-315/g2.go | 8 ++--- ecc/bls24-315/multiexp.go | 18 +++++----- ecc/bls24-315/multiexp_affine.go | 35 +++++++++++-------- ecc/bls24-315/multiexp_jacobian.go | 4 +-- ecc/bls24-317/g1.go | 8 ++--- ecc/bls24-317/g2.go | 8 ++--- ecc/bls24-317/multiexp.go | 18 +++++----- ecc/bls24-317/multiexp_affine.go | 35 +++++++++++-------- ecc/bls24-317/multiexp_jacobian.go | 4 +-- ecc/bn254/g1.go | 8 ++--- ecc/bn254/g2.go | 8 ++--- ecc/bn254/multiexp.go | 18 +++++----- ecc/bn254/multiexp_affine.go | 35 +++++++++++-------- ecc/bn254/multiexp_jacobian.go | 4 +-- ecc/bw6-633/g1.go | 8 ++--- ecc/bw6-633/g2.go | 8 ++--- ecc/bw6-633/multiexp.go | 18 
+++++----- ecc/bw6-633/multiexp_affine.go | 35 +++++++++++-------- ecc/bw6-633/multiexp_jacobian.go | 4 +-- ecc/bw6-756/g1.go | 8 ++--- ecc/bw6-756/g2.go | 8 ++--- ecc/bw6-756/multiexp.go | 18 +++++----- ecc/bw6-756/multiexp_affine.go | 35 +++++++++++-------- ecc/bw6-756/multiexp_jacobian.go | 4 +-- ecc/bw6-761/g1.go | 8 ++--- ecc/bw6-761/g2.go | 8 ++--- ecc/bw6-761/multiexp.go | 18 +++++----- ecc/bw6-761/multiexp_affine.go | 35 +++++++++++-------- ecc/bw6-761/multiexp_jacobian.go | 4 +-- .../generator/ecc/template/multiexp.go.tmpl | 14 ++++---- .../ecc/template/multiexp_affine.go.tmpl | 20 ++++++----- .../ecc/template/multiexp_jacobian.go.tmpl | 2 +- internal/generator/ecc/template/point.go.tmpl | 8 ++--- 49 files changed, 337 insertions(+), 364 deletions(-) diff --git a/ecc/bls12-377/g1.go b/ecc/bls12-377/g1.go index 3be98b91a4..590c40246e 100644 --- a/ecc/bls12-377/g1.go +++ b/ecc/bls12-377/g1.go @@ -980,13 +980,9 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG1Affine(R []*G1Affine, P []G1Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bls12-377/g2.go b/ecc/bls12-377/g2.go index 4b6f3de628..92a51f4b54 100644 --- a/ecc/bls12-377/g2.go +++ b/ecc/bls12-377/g2.go @@ -976,13 +976,9 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG2Affine(R []*G2Affine, P []G2Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index 53a1823e0d..6360f5cb35 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -205,8 +205,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -453,8 +453,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { +func 
_innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -541,14 +541,14 @@ type selector struct { // scalarsMont indicates wheter the provided scalars are in montgomery form // returns smallValues, which represent the number of scalars which meets the following condition // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar nbChunks := fr.Limbs * 64 / c if (fr.Limbs*64)%c != 0 { nbChunks++ } - toReturn := make([]uint32, len(scalars)*int(nbChunks)) + toReturn := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window @@ -630,11 +630,11 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint32 + var bits uint16 if digit >= 0 { - bits = uint32(digit) << 1 + bits = uint16(digit) << 1 } else { - bits = (uint32(-digit-1) << 1) + 1 + bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits // [s.index] |= (bits << s.shift) diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index 41c16a3afe..eef9112dda 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -19,7 +19,8 @@ package bls12377 const MAX_BATCH_SIZE = 600 type batchOp struct { - bucketID, pointID uint32 + pointID uint32 + bucketID uint16 } func (o batchOp) isNeg() bool { @@ -36,7 +37,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -57,10 +58,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G1Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G1Affine - canAdd := func(bID uint32) bool { + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G1Affine + + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -168,10 +172,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -188,7 +192,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } @@ -259,7 +262,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { // init 
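
The digit encoding that partitionScalars emits packs a signed window digit d into an unsigned value: d >= 0 becomes d<<1, and d < 0 becomes ((-d-1)<<1)|1, so the low bit flags a subtraction and the remaining bits select the bucket ((bits>>1)-1 for an add, bits>>1 for a sub, matching the decoding in processChunk*BatchAffine). With the window size capped at c == 16 the encoded value always fits in 16 bits, which is what lets digits shrink from []uint32 to []uint16 and halves that slice's memory traffic. A small round-trip sketch (hypothetical helper names, not the library's API):

package main

import "fmt"

// encodeDigit packs a signed window digit into a uint16: the low bit is
// the "negative" flag, the high bits carry the magnitude, mirroring the
// bits computation in partitionScalars above.
func encodeDigit(d int) uint16 {
	if d >= 0 {
		return uint16(d) << 1
	}
	return (uint16(-d-1) << 1) | 1
}

// decodeDigit recovers the signed digit, mirroring the bucketID
// computation in processChunk*BatchAffine.
func decodeDigit(bits uint16) int {
	if bits&1 == 0 {
		return int(bits >> 1) // bucket (bits>>1)-1, point is added
	}
	return -int(bits>>1) - 1 // bucket bits>>1, point is subtracted
}

func main() {
	for _, d := range []int{0, 1, 5, -1, -5, 1 << 14} {
		bits := encodeDigit(d)
		fmt.Printf("d=%6d  bits=%#06x  decoded=%6d\n", d, bits, decodeDigit(bits))
	}
}
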
the buckets var buckets B @@ -280,10 +283,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G2Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G2Affine + + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G2Affine - canAdd := func(bID uint32) bool { + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -391,10 +397,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -411,7 +417,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } diff --git a/ecc/bls12-377/multiexp_jacobian.go b/ecc/bls12-377/multiexp_jacobian.go index be722067bd..ae9aca6c47 100644 --- a/ecc/bls12-377/multiexp_jacobian.go +++ b/ecc/bls12-377/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { @@ -99,7 +99,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { diff --git a/ecc/bls12-378/g1.go b/ecc/bls12-378/g1.go index 1545108a66..08bd9e0fda 100644 --- a/ecc/bls12-378/g1.go +++ b/ecc/bls12-378/g1.go @@ -980,13 +980,9 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG1Affine(R []*G1Affine, P []G1Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bls12-378/g2.go b/ecc/bls12-378/g2.go index 26aaa42624..840049be26 100644 --- a/ecc/bls12-378/g2.go +++ b/ecc/bls12-378/g2.go @@ -976,13 +976,9 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG2Affine(R []*G2Affine, P []G2Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 
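
The processChunk*BatchAffine logic above enforces one invariant: a bucket may appear at most once per batch, because all the batched additions share one inversion and must be independent. Hence the bucketIds bitset guarding canAdd, the queue holding colliding ops, and executeAndReset flushing a full batch before the queue is retried. A stripped-down sketch of that scheduling pattern, where a map stands in for the BS bitset and the flush callback stands in for batchAdd* (types and names are illustrative, not the library's API):

package main

import "fmt"

type op struct{ bucketID uint16 }

type batcher struct {
	inBatch map[uint16]bool // stands in for the BS bitset
	batch   []op
	queue   []op
	size    int
	flush   func([]op)
}

func (b *batcher) executeAndReset() {
	if len(b.batch) == 0 {
		return
	}
	b.flush(b.batch) // one batch-inverted addition pass in the real code
	b.batch = b.batch[:0]
	b.inBatch = make(map[uint16]bool)
}

func (b *batcher) add(o op) {
	if b.inBatch[o.bucketID] { // conflict: same bucket twice in one batch
		b.queue = append(b.queue, o)
		return
	}
	b.inBatch[o.bucketID] = true
	b.batch = append(b.batch, o)
	if len(b.batch) == b.size {
		b.executeAndReset()
		// retry queued ops now that the batch is empty
		q := b.queue
		b.queue = b.queue[:0]
		for _, o := range q {
			b.add(o)
		}
	}
}

func main() {
	b := &batcher{inBatch: make(map[uint16]bool), size: 3,
		flush: func(ops []op) { fmt.Println("flush", ops) }}
	for _, id := range []uint16{1, 2, 2, 3, 4, 1} {
		b.add(op{id})
	}
	b.executeAndReset()
	fmt.Println("left in queue:", b.queue)
}
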
73e162f80d..e88325d4ae 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -205,8 +205,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -453,8 +453,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -541,14 +541,14 @@ type selector struct { // scalarsMont indicates wheter the provided scalars are in montgomery form // returns smallValues, which represent the number of scalars which meets the following condition // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar nbChunks := fr.Limbs * 64 / c if (fr.Limbs*64)%c != 0 { nbChunks++ } - toReturn := make([]uint32, len(scalars)*int(nbChunks)) + toReturn := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window @@ -630,11 +630,11 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint32 + var bits uint16 if digit >= 0 { - bits = uint32(digit) << 1 + bits = uint16(digit) << 1 } else { - bits = (uint32(-digit-1) << 1) + 1 + bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits // [s.index] |= (bits << s.shift) diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index 95eb76b3ac..6023381a17 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -19,7 +19,8 @@ package bls12378 const MAX_BATCH_SIZE = 600 type batchOp struct { - bucketID, pointID uint32 + pointID uint32 + bucketID uint16 } func (o batchOp) isNeg() bool { @@ -36,7 +37,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -57,10 +58,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // 
points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G1Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G1Affine - canAdd := func(bID uint32) bool { + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G1Affine + + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -168,10 +172,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -188,7 +192,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } @@ -259,7 +262,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -280,10 +283,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G2Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G2Affine + + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G2Affine - canAdd := func(bID uint32) bool { + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -391,10 +397,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -411,7 +417,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } diff --git a/ecc/bls12-378/multiexp_jacobian.go b/ecc/bls12-378/multiexp_jacobian.go index 6a8cfa2d32..0637114932 100644 --- a/ecc/bls12-378/multiexp_jacobian.go +++ b/ecc/bls12-378/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { @@ -99,7 +99,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { diff --git a/ecc/bls12-381/g1.go b/ecc/bls12-381/g1.go index 5a59011791..6c3edabedc 100644 --- a/ecc/bls12-381/g1.go +++ b/ecc/bls12-381/g1.go @@ -980,13 +980,9 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// 
cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG1Affine(R []*G1Affine, P []G1Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bls12-381/g2.go b/ecc/bls12-381/g2.go index 6b7dfa5639..3768ec7eda 100644 --- a/ecc/bls12-381/g2.go +++ b/ecc/bls12-381/g2.go @@ -977,13 +977,9 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG2Affine(R []*G2Affine, P []G2Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index 80ff8bfc30..fabd850c74 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -205,8 +205,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -453,8 +453,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -541,14 +541,14 @@ type selector struct { // scalarsMont indicates wheter the provided scalars are in montgomery form // returns smallValues, which represent the number of scalars which meets the following condition // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar nbChunks := fr.Limbs * 64 / c if (fr.Limbs*64)%c != 0 { nbChunks++ } - toReturn := make([]uint32, len(scalars)*int(nbChunks)) + toReturn := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window @@ -630,11 +630,11 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint32 + var bits uint16 if 
digit >= 0 { - bits = uint32(digit) << 1 + bits = uint16(digit) << 1 } else { - bits = (uint32(-digit-1) << 1) + 1 + bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits // [s.index] |= (bits << s.shift) diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index 5a51ee46b6..f388d3d5ea 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -19,7 +19,8 @@ package bls12381 const MAX_BATCH_SIZE = 600 type batchOp struct { - bucketID, pointID uint32 + pointID uint32 + bucketID uint16 } func (o batchOp) isNeg() bool { @@ -36,7 +37,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -57,10 +58,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G1Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G1Affine - canAdd := func(bID uint32) bool { + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G1Affine + + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -168,10 +172,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -188,7 +192,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } @@ -259,7 +262,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -280,10 +283,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G2Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G2Affine + + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G2Affine - canAdd := func(bID uint32) bool { + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -391,10 +397,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -411,7 +417,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } diff --git a/ecc/bls12-381/multiexp_jacobian.go b/ecc/bls12-381/multiexp_jacobian.go index fabbf2d237..17139a4f22 
100644 --- a/ecc/bls12-381/multiexp_jacobian.go +++ b/ecc/bls12-381/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { @@ -99,7 +99,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { diff --git a/ecc/bls24-315/g1.go b/ecc/bls24-315/g1.go index cd0d0a8a69..25faa396cf 100644 --- a/ecc/bls24-315/g1.go +++ b/ecc/bls24-315/g1.go @@ -982,13 +982,9 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG1Affine(R []*G1Affine, P []G1Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bls24-315/g2.go b/ecc/bls24-315/g2.go index 7fa2e026c3..32601c0b08 100644 --- a/ecc/bls24-315/g2.go +++ b/ecc/bls24-315/g2.go @@ -992,13 +992,9 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG2Affine(R []*G2Affine, P []G2Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index f61ab96f3d..207a4c7f23 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -205,8 +205,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -453,8 +453,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { 
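
For orientation, the chunk count computed just below is ceil(number of scalar bits / c): a 4-limb, 256-bit scalar gives 16 chunks at c == 16 and 22 chunks (21 full plus one partial) at c == 12. Restated as a tiny standalone function:

package main

import "fmt"

// nbChunks returns the number of c-bit windows in an n-limb scalar,
// the same ceiling division _innerMsm* performs below.
func nbChunks(limbs, c uint64) uint64 {
	n := limbs * 64 / c
	if (limbs*64)%c != 0 {
		n++
	}
	return n
}

func main() {
	fmt.Println(nbChunks(4, 16)) // 256-bit scalar, c=16 -> 16 windows
	fmt.Println(nbChunks(4, 12)) // c=12 -> 22 windows (21 full + 1 partial)
}
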
nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -541,14 +541,14 @@ type selector struct { // scalarsMont indicates wheter the provided scalars are in montgomery form // returns smallValues, which represent the number of scalars which meets the following condition // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar nbChunks := fr.Limbs * 64 / c if (fr.Limbs*64)%c != 0 { nbChunks++ } - toReturn := make([]uint32, len(scalars)*int(nbChunks)) + toReturn := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window @@ -630,11 +630,11 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint32 + var bits uint16 if digit >= 0 { - bits = uint32(digit) << 1 + bits = uint16(digit) << 1 } else { - bits = (uint32(-digit-1) << 1) + 1 + bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits // [s.index] |= (bits << s.shift) diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index 3714629530..9ec9c35382 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -19,7 +19,8 @@ package bls24315 const MAX_BATCH_SIZE = 600 type batchOp struct { - bucketID, pointID uint32 + pointID uint32 + bucketID uint16 } func (o batchOp) isNeg() bool { @@ -36,7 +37,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -57,10 +58,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G1Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G1Affine - canAdd := func(bID uint32) bool { + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G1Affine + + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -168,10 +172,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -188,7 +192,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } @@ -259,7 +262,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -280,10 +283,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of 
bucket + point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G2Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G2Affine + + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G2Affine - canAdd := func(bID uint32) bool { + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -391,10 +397,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -411,7 +417,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } diff --git a/ecc/bls24-315/multiexp_jacobian.go b/ecc/bls24-315/multiexp_jacobian.go index a3d633de01..6e3ea0e2f9 100644 --- a/ecc/bls24-315/multiexp_jacobian.go +++ b/ecc/bls24-315/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { @@ -99,7 +99,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { diff --git a/ecc/bls24-317/g1.go b/ecc/bls24-317/g1.go index 4a28082836..f1e3773049 100644 --- a/ecc/bls24-317/g1.go +++ b/ecc/bls24-317/g1.go @@ -982,13 +982,9 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG1Affine(R []*G1Affine, P []G1Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bls24-317/g2.go b/ecc/bls24-317/g2.go index acd1176cdd..0f1693cdc8 100644 --- a/ecc/bls24-317/g2.go +++ b/ecc/bls24-317/g2.go @@ -992,13 +992,9 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG2Affine(R []*G2Affine, P []G2Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index 8b81840e50..b9baa2cec7 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -205,8 +205,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config } } -func _innerMsmG1(p *G1Jac, c uint64, 
points []G1Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -453,8 +453,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -541,14 +541,14 @@ type selector struct { // scalarsMont indicates wheter the provided scalars are in montgomery form // returns smallValues, which represent the number of scalars which meets the following condition // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar nbChunks := fr.Limbs * 64 / c if (fr.Limbs*64)%c != 0 { nbChunks++ } - toReturn := make([]uint32, len(scalars)*int(nbChunks)) + toReturn := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window @@ -630,11 +630,11 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint32 + var bits uint16 if digit >= 0 { - bits = uint32(digit) << 1 + bits = uint16(digit) << 1 } else { - bits = (uint32(-digit-1) << 1) + 1 + bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits // [s.index] |= (bits << s.shift) diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index 789e3e18be..e27eb9efeb 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -19,7 +19,8 @@ package bls24317 const MAX_BATCH_SIZE = 600 type batchOp struct { - bucketID, pointID uint32 + pointID uint32 + bucketID uint16 } func (o batchOp) isNeg() bool { @@ -36,7 +37,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -57,10 +58,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G1Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G1Affine - canAdd := func(bID uint32) bool { + // points to be added to R (buckets); it is beneficial to 
store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G1Affine + + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -168,10 +172,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -188,7 +192,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } @@ -259,7 +262,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -280,10 +283,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G2Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G2Affine + + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G2Affine - canAdd := func(bID uint32) bool { + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -391,10 +397,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -411,7 +417,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } diff --git a/ecc/bls24-317/multiexp_jacobian.go b/ecc/bls24-317/multiexp_jacobian.go index 7e832db4e7..c4fc41bc54 100644 --- a/ecc/bls24-317/multiexp_jacobian.go +++ b/ecc/bls24-317/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { @@ -99,7 +99,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { diff --git a/ecc/bn254/g1.go b/ecc/bn254/g1.go index 84e72c7c5f..75a3e25983 100644 --- a/ecc/bn254/g1.go +++ b/ecc/bn254/g1.go @@ -952,13 +952,9 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG1Affine(R []*G1Affine, P []G1Affine) { batchSize := len(R) if batchSize == 0 { diff --git 
a/ecc/bn254/g2.go b/ecc/bn254/g2.go index 2bef2cba4c..6437e4542e 100644 --- a/ecc/bn254/g2.go +++ b/ecc/bn254/g2.go @@ -981,13 +981,9 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG2Affine(R []*G2Affine, P []G2Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index ac979ddff7..75e8a96061 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -205,8 +205,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -453,8 +453,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -541,14 +541,14 @@ type selector struct { // scalarsMont indicates wheter the provided scalars are in montgomery form // returns smallValues, which represent the number of scalars which meets the following condition // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar nbChunks := fr.Limbs * 64 / c if (fr.Limbs*64)%c != 0 { nbChunks++ } - toReturn := make([]uint32, len(scalars)*int(nbChunks)) + toReturn := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window @@ -630,11 +630,11 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint32 + var bits uint16 if digit >= 0 { - bits = uint32(digit) << 1 + bits = uint16(digit) << 1 } else { - bits = (uint32(-digit-1) << 1) + 1 + bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits // [s.index] |= (bits << s.shift) diff --git a/ecc/bn254/multiexp_affine.go 
b/ecc/bn254/multiexp_affine.go index 9880a1276b..1f6ba85280 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -19,7 +19,8 @@ package bn254 const MAX_BATCH_SIZE = 600 type batchOp struct { - bucketID, pointID uint32 + pointID uint32 + bucketID uint16 } func (o batchOp) isNeg() bool { @@ -36,7 +37,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -57,10 +58,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G1Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G1Affine - canAdd := func(bID uint32) bool { + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G1Affine + + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -168,10 +172,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -188,7 +192,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } @@ -259,7 +262,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -280,10 +283,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G2Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G2Affine + + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G2Affine - canAdd := func(bID uint32) bool { + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -391,10 +397,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -411,7 +417,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } diff --git a/ecc/bn254/multiexp_jacobian.go b/ecc/bn254/multiexp_jacobian.go index a682232ec6..288063d39a 100644 --- a/ecc/bn254/multiexp_jacobian.go +++ b/ecc/bn254/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); 
i++ { @@ -99,7 +99,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { diff --git a/ecc/bw6-633/g1.go b/ecc/bw6-633/g1.go index d70a92aeec..707fbefb6f 100644 --- a/ecc/bw6-633/g1.go +++ b/ecc/bw6-633/g1.go @@ -1084,13 +1084,9 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG1Affine(R []*G1Affine, P []G1Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bw6-633/g2.go b/ecc/bw6-633/g2.go index a84adbb320..69e4b4263c 100644 --- a/ecc/bw6-633/g2.go +++ b/ecc/bw6-633/g2.go @@ -947,13 +947,9 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG2Affine(R []*G2Affine, P []G2Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index 23c35d3d90..fb6367cb6b 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -168,8 +168,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -379,8 +379,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -467,14 +467,14 @@ type selector struct { // scalarsMont indicates wheter the provided scalars are in montgomery form // returns smallValues, which represent the number of scalars which meets the following condition // 0 < scalar < 2^c (in 
other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar nbChunks := fr.Limbs * 64 / c if (fr.Limbs*64)%c != 0 { nbChunks++ } - toReturn := make([]uint32, len(scalars)*int(nbChunks)) + toReturn := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window @@ -556,11 +556,11 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint32 + var bits uint16 if digit >= 0 { - bits = uint32(digit) << 1 + bits = uint16(digit) << 1 } else { - bits = (uint32(-digit-1) << 1) + 1 + bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits // [s.index] |= (bits << s.shift) diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index 5b679cba91..38c1973d0e 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -19,7 +19,8 @@ package bw6633 const MAX_BATCH_SIZE = 600 type batchOp struct { - bucketID, pointID uint32 + pointID uint32 + bucketID uint16 } func (o batchOp) isNeg() bool { @@ -36,7 +37,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -57,10 +58,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G1Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G1Affine - canAdd := func(bID uint32) bool { + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G1Affine + + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -168,10 +172,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -188,7 +192,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } @@ -241,7 +244,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -262,10 +265,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G2Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G2Affine + + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G2Affine 
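// Aside: the "batch inversion" these hunks rely on is Montgomery's trick:
// n field inversions are traded for one inversion plus roughly 3n
// multiplications, which is why the (removed) cost comment reads roughly
// "5*batchSize M + 1I" per batch of affine additions. A minimal,
// self-contained sketch over a toy modulus using math/big -- the names
// here are illustrative, not gnark-crypto's fp.Element API, but the
// fp.Element version in the hunks follows the same shape:

package main

import (
	"fmt"
	"math/big"
)

// batchInvert replaces each a[i] by a[i]^-1 mod p using a single modular
// inversion: accumulate prefix products forward, invert the total once,
// then unwind backwards.
func batchInvert(a []*big.Int, p *big.Int) {
	n := len(a)
	if n == 0 {
		return
	}
	prefix := make([]*big.Int, n)
	acc := big.NewInt(1)
	for i := 0; i < n; i++ {
		prefix[i] = new(big.Int).Set(acc) // product of a[0..i-1]
		acc.Mul(acc, a[i]).Mod(acc, p)
	}
	acc.ModInverse(acc, p) // the single inversion, of the full product
	for i := n - 1; i >= 0; i-- {
		tmp := new(big.Int).Set(a[i])
		a[i].Mul(prefix[i], acc).Mod(a[i], p) // a[i]^-1
		acc.Mul(acc, tmp).Mod(acc, p)         // drop a[i] from the running inverse
	}
}

func main() {
	p := big.NewInt(101) // toy prime standing in for the base-field modulus
	xs := []*big.Int{big.NewInt(3), big.NewInt(7), big.NewInt(42)}
	batchInvert(xs, p)
	fmt.Println(xs) // inverses of 3, 7, 42 mod 101: [34 29 89]
}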
- canAdd := func(bID uint32) bool { + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -373,10 +379,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -393,7 +399,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } diff --git a/ecc/bw6-633/multiexp_jacobian.go b/ecc/bw6-633/multiexp_jacobian.go index 29756cc499..e39d7fc165 100644 --- a/ecc/bw6-633/multiexp_jacobian.go +++ b/ecc/bw6-633/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { @@ -77,7 +77,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { diff --git a/ecc/bw6-756/g1.go b/ecc/bw6-756/g1.go index 57631a43e5..395f0a7f84 100644 --- a/ecc/bw6-756/g1.go +++ b/ecc/bw6-756/g1.go @@ -1084,13 +1084,9 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG1Affine(R []*G1Affine, P []G1Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bw6-756/g2.go b/ecc/bw6-756/g2.go index c2b10451c0..63a4631e6d 100644 --- a/ecc/bw6-756/g2.go +++ b/ecc/bw6-756/g2.go @@ -941,13 +941,9 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG2Affine(R []*G2Affine, P []G2Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index 0124b603f4..0ba6a6ed57 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -169,8 +169,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c 
uint64, points []G1Affine, digits []uint16)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -381,8 +381,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -469,14 +469,14 @@ type selector struct { // scalarsMont indicates wheter the provided scalars are in montgomery form // returns smallValues, which represent the number of scalars which meets the following condition // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar nbChunks := fr.Limbs * 64 / c if (fr.Limbs*64)%c != 0 { nbChunks++ } - toReturn := make([]uint32, len(scalars)*int(nbChunks)) + toReturn := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window @@ -558,11 +558,11 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint32 + var bits uint16 if digit >= 0 { - bits = uint32(digit) << 1 + bits = uint16(digit) << 1 } else { - bits = (uint32(-digit-1) << 1) + 1 + bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits // [s.index] |= (bits << s.shift) diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index fade05f3ff..c59b38e882 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -19,7 +19,8 @@ package bw6756 const MAX_BATCH_SIZE = 600 type batchOp struct { - bucketID, pointID uint32 + pointID uint32 + bucketID uint16 } func (o batchOp) isNeg() bool { @@ -36,7 +37,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -57,10 +58,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G1Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G1Affine - canAdd := func(bID uint32) bool { + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G1Affine + + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -168,10 +172,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { 
// sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -188,7 +192,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } @@ -241,7 +244,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -262,10 +265,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G2Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G2Affine + + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G2Affine - canAdd := func(bID uint32) bool { + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -373,10 +379,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -393,7 +399,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } diff --git a/ecc/bw6-756/multiexp_jacobian.go b/ecc/bw6-756/multiexp_jacobian.go index 10a354ae58..0cba708584 100644 --- a/ecc/bw6-756/multiexp_jacobian.go +++ b/ecc/bw6-756/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { @@ -77,7 +77,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { diff --git a/ecc/bw6-761/g1.go b/ecc/bw6-761/g1.go index 08f5e476d9..880371b042 100644 --- a/ecc/bw6-761/g1.go +++ b/ecc/bw6-761/g1.go @@ -1095,13 +1095,9 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG1Affine(R []*G1Affine, P []G1Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bw6-761/g2.go b/ecc/bw6-761/g2.go index 48a7a69586..892cedad40 100644 --- a/ecc/bw6-761/g2.go +++ b/ecc/bw6-761/g2.go @@ -955,13 +955,9 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) 
== len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG2Affine(R []*G2Affine, P []G2Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index fc2c7c4908..f8165ca221 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -169,8 +169,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -381,8 +381,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -469,14 +469,14 @@ type selector struct { // scalarsMont indicates wheter the provided scalars are in montgomery form // returns smallValues, which represent the number of scalars which meets the following condition // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar nbChunks := fr.Limbs * 64 / c if (fr.Limbs*64)%c != 0 { nbChunks++ } - toReturn := make([]uint32, len(scalars)*int(nbChunks)) + toReturn := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window @@ -558,11 +558,11 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint32 + var bits uint16 if digit >= 0 { - bits = uint32(digit) << 1 + bits = uint16(digit) << 1 } else { - bits = (uint32(-digit-1) << 1) + 1 + bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits // [s.index] |= (bits << s.shift) diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index 325f500b60..83b2c11fbe 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -19,7 +19,8 @@ package bw6761 const MAX_BATCH_SIZE = 600 type batchOp struct { - bucketID, pointID uint32 + pointID uint32 + bucketID uint16 } func (o batchOp) isNeg() bool { @@ -36,7 +37,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS 
bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -57,10 +58,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G1Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G1Affine - canAdd := func(bID uint32) bool { + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G1Affine + + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -168,10 +172,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -188,7 +192,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } @@ -241,7 +244,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -262,10 +265,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G2Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G2Affine + + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G2Affine - canAdd := func(bID uint32) bool { + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -373,10 +379,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -393,7 +399,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } diff --git a/ecc/bw6-761/multiexp_jacobian.go b/ecc/bw6-761/multiexp_jacobian.go index 045bace5e7..af2d68b853 100644 --- a/ecc/bw6-761/multiexp_jacobian.go +++ b/ecc/bw6-761/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { @@ -77,7 +77,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index 
5940c0e8c5..4655925e5e 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -40,14 +40,14 @@ type selector struct { // scalarsMont indicates wheter the provided scalars are in montgomery form // returns smallValues, which represent the number of scalars which meets the following condition // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar nbChunks := fr.Limbs * 64 / c if (fr.Limbs * 64)%c != 0 { nbChunks++ } - toReturn := make([]uint32, len(scalars)*int(nbChunks)) + toReturn := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window @@ -131,11 +131,11 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint32 + var bits uint16 if digit >= 0 { - bits = uint32(digit) << 1 + bits = uint16(digit) << 1 } else { - bits = (uint32(-digit-1) << 1) + 1 + bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits // [s.index] |= (bits << s.shift) @@ -447,8 +447,8 @@ func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffi } } -func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, digits []uint32)) *{{ $.TJacobian }} { +func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, digits []uint16)) *{{ $.TJacobian }} { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index 044f5d3cc9..cf1c0ce71f 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -10,7 +10,8 @@ const MAX_BATCH_SIZE = 600 type batchOp struct { - bucketID, pointID uint32 + pointID uint32 + bucketID uint16 } func (o batchOp) isNeg() bool { @@ -37,7 +38,7 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](c chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -58,10 +59,14 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](c var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]{{ $.TAffine }} // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*{{ $.TAffine }} // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*{{ $.TAffine }} - canAdd := func(bID uint32) bool { + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]{{ $.TAffine }} + + + canAdd := func(bID uint16) bool { 
return !bucketIds[bID] } @@ -171,10 +176,10 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](c // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit>>1) - 1) + op.bucketID = uint16((digit>>1) - 1) } else { // sub - op.bucketID = (uint32((digit>>1))) + op.bucketID = (uint16((digit>>1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -191,7 +196,6 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](c executeAndReset() processQueue() } - // queue = append(queue, op) } } diff --git a/internal/generator/ecc/template/multiexp_jacobian.go.tmpl b/internal/generator/ecc/template/multiexp_jacobian.go.tmpl index ee1f1d2080..7aaec9f186 100644 --- a/internal/generator/ecc/template/multiexp_jacobian.go.tmpl +++ b/internal/generator/ecc/template/multiexp_jacobian.go.tmpl @@ -19,7 +19,7 @@ func processChunk{{ $.UPointName }}Jacobian[B ib{{ $.TJacobianExtended }}](chunk chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, - digits []uint32) { + digits []uint16) { diff --git a/internal/generator/ecc/template/point.go.tmpl b/internal/generator/ecc/template/point.go.tmpl index 2a3b418cbb..6e30ad9de8 100644 --- a/internal/generator/ecc/template/point.go.tmpl +++ b/internal/generator/ecc/template/point.go.tmpl @@ -1571,13 +1571,9 @@ func BatchScalarMultiplication{{ toUpper .PointName }}(base *{{ $TAffine }}, sca -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAdd{{ $TAffine }}(R []*{{ $TAffine }},P []{{ $TAffine }}) { batchSize := len(R) if batchSize == 0 { From 3fd6c7e5f21cc68ca0562317c118ff65513e0f20 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Mon, 14 Nov 2022 15:54:41 -0600 Subject: [PATCH 19/43] perf: allocate batch affine arrays on the stack with generics --- ecc/bls12-377/g1.go | 61 +-- ecc/bls12-377/g2.go | 61 +-- ecc/bls12-377/multiexp.go | 28 +- ecc/bls12-377/multiexp_affine.go | 494 ++++++++++++------ ecc/bls12-378/g1.go | 61 +-- ecc/bls12-378/g2.go | 61 +-- ecc/bls12-378/multiexp.go | 28 +- ecc/bls12-378/multiexp_affine.go | 494 ++++++++++++------ ecc/bls12-381/g1.go | 61 +-- ecc/bls12-381/g2.go | 61 +-- ecc/bls12-381/multiexp.go | 28 +- ecc/bls12-381/multiexp_affine.go | 494 ++++++++++++------ ecc/bls24-315/g1.go | 61 +-- ecc/bls24-315/g2.go | 61 +-- ecc/bls24-315/multiexp.go | 28 +- ecc/bls24-315/multiexp_affine.go | 494 ++++++++++++------ ecc/bls24-317/g1.go | 61 +-- ecc/bls24-317/g2.go | 61 +-- ecc/bls24-317/multiexp.go | 28 +- ecc/bls24-317/multiexp_affine.go | 494 ++++++++++++------ ecc/bn254/g1.go | 61 +-- ecc/bn254/g2.go | 61 +-- ecc/bn254/multiexp.go | 28 +- ecc/bn254/multiexp_affine.go | 494 ++++++++++++------ ecc/bw6-633/g1.go | 61 +-- ecc/bw6-633/g2.go | 61 +-- ecc/bw6-633/multiexp.go | 4 +- ecc/bw6-633/multiexp_affine.go | 386 ++++++++------ ecc/bw6-756/g1.go | 61 +-- ecc/bw6-756/g2.go | 61 +-- ecc/bw6-756/multiexp.go | 4 +- ecc/bw6-756/multiexp_affine.go | 386 ++++++++------ ecc/bw6-761/g1.go | 61 +-- ecc/bw6-761/g2.go | 61 +-- ecc/bw6-761/multiexp.go | 4 +- ecc/bw6-761/multiexp_affine.go | 386 ++++++++------ internal/generator/ecc/generate.go | 32 ++ 
.../generator/ecc/template/multiexp.go.tmpl | 4 +- .../ecc/template/multiexp_affine.go.tmpl | 236 ++++++--- internal/generator/ecc/template/point.go.tmpl | 62 +-- 40 files changed, 3504 insertions(+), 2230 deletions(-) diff --git a/ecc/bls12-377/g1.go b/ecc/bls12-377/g1.go index 590c40246e..910dd07b5e 100644 --- a/ecc/bls12-377/g1.go +++ b/ecc/bls12-377/g1.go @@ -983,20 +983,31 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAddG1Affine(R []*G1Affine, P []G1Affine) { - batchSize := len(R) - if batchSize == 0 { - return - } - var lambda, lambdain [MAX_BATCH_SIZE]fp.Element +func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, batchSize int) { + var lambda, lambdain TC // add part for j := 0; j < batchSize; j++ { - lambdain[j].Sub(&P[j].X, &R[j].X) + lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X) } - // invert denominator - batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) + // invert denominator using montgomery batch invert technique + { + var accumulator fp.Element + accumulator.SetOne() + + for i := 0; i < batchSize; i++ { + lambda[i] = accumulator + accumulator.Mul(&accumulator, &lambdain[i]) + } + + accumulator.Inverse(&accumulator) + + for i := batchSize - 1; i >= 0; i-- { + lambda[i].Mul(&lambda[i], &accumulator) + accumulator.Mul(&accumulator, &lambdain[i]) + } + } var d fp.Element var rr G1Affine @@ -1004,36 +1015,16 @@ func batchAddG1Affine(R []*G1Affine, P []G1Affine) { // add part for j := 0; j < batchSize; j++ { // computa lambda - d.Sub(&P[j].Y, &R[j].Y) + d.Sub(&(*P)[j].Y, &(*R)[j].Y) lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[j].X) - rr.X.Sub(&rr.X, &P[j].X) - d.Sub(&R[j].X, &rr.X) + rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) - } -} - -// batch inversion -// similar to BatchInvertfp.Element, ignores edge cases -func batchInvertG1Affine(res, a []fp.Element) { - - var accumulator fp.Element - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } diff --git a/ecc/bls12-377/g2.go b/ecc/bls12-377/g2.go index 92a51f4b54..0fe9a4119c 100644 --- a/ecc/bls12-377/g2.go +++ b/ecc/bls12-377/g2.go @@ -979,20 +979,31 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAddG2Affine(R []*G2Affine, P []G2Affine) { - batchSize := len(R) - if batchSize == 0 { - return - } - var lambda, lambdain [MAX_BATCH_SIZE]fptower.E2 +func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, batchSize int) { + var lambda, lambdain TC // add part for j := 0; j < batchSize; j++ { - lambdain[j].Sub(&P[j].X, &R[j].X) + lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X) } - // invert denominator - batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) + // invert denominator using montgomery batch invert technique + { + var accumulator fptower.E2 + accumulator.SetOne() + + for i := 0; i 
< batchSize; i++ { + lambda[i] = accumulator + accumulator.Mul(&accumulator, &lambdain[i]) + } + + accumulator.Inverse(&accumulator) + + for i := batchSize - 1; i >= 0; i-- { + lambda[i].Mul(&lambda[i], &accumulator) + accumulator.Mul(&accumulator, &lambdain[i]) + } + } var d fptower.E2 var rr G2Affine @@ -1000,36 +1011,16 @@ func batchAddG2Affine(R []*G2Affine, P []G2Affine) { // add part for j := 0; j < batchSize; j++ { // computa lambda - d.Sub(&P[j].Y, &R[j].Y) + d.Sub(&(*P)[j].Y, &(*R)[j].Y) lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[j].X) - rr.X.Sub(&rr.X, &P[j].X) - d.Sub(&R[j].X, &rr.X) + rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) - } -} - -// batch inversion -// similar to BatchInvertfptower.E2, ignores edge cases -func batchInvertG2Affine(res, a []fptower.E2) { - - var accumulator fptower.E2 - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index 6360f5cb35..9f2a1998fc 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -174,31 +174,31 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qOpsG1AffineC10, cG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qOpsG1AffineC11, cG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qOpsG1AffineC12, cG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qOpsG1AffineC13, cG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qOpsG1AffineC14, cG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, 
processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15] + processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qOpsG1AffineC15, cG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -422,31 +422,31 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10] + processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qOpsG2AffineC10, cG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qOpsG2AffineC11, cG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qOpsG2AffineC12, cG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13] + processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qOpsG2AffineC13, cG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qOpsG2AffineC14, cG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15] + processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qOpsG2AffineC15, cG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") diff --git a/ecc/bls12-377/multiexp_affine.go 
b/ecc/bls12-377/multiexp_affine.go index eef9112dda..c1d32b5ded 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -16,15 +16,18 @@ package bls12377 -const MAX_BATCH_SIZE = 600 +import ( + "github.com/consensys/gnark-crypto/ecc/bls12-377/fp" + "github.com/consensys/gnark-crypto/ecc/bls12-377/internal/fptower" +) -type batchOp struct { - pointID uint32 +type batchOpG1Affine struct { bucketID uint16 + point G1Affine } -func (o batchOp) isNeg() bool { - return o.pointID&1 == 1 +func (o batchOpG1Affine) isNeg() bool { + return o.bucketID&1 == 1 } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -33,7 +36,8 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( + chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -47,22 +51,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G1Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G1Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -76,76 +71,98 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG1Affine(R[:cptAdd], P[:cptAdd]) + batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG1Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() return } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G1Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? 
+ // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . + if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[qID-1] + qID-- } } @@ -154,7 +171,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -164,40 +181,47 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
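// Aside: the flow above (canAdd / add / addFromQueue / processQueue /
// executeAndReset) is a conflict-avoiding scheduler: two additions into
// the same bucket cannot share one batched inversion, because the second
// needs the result of the first, so a conflicting op is parked in a queue
// and retried after the current batch is flushed. A runnable sketch of
// just that discipline, with the curve arithmetic abstracted to integer
// sums (all names are illustrative, none are gnark-crypto's):

package main

import "fmt"

type op struct {
	bucketID int
	value    int
}

// schedule batches ops so that no two ops in one flush touch the same
// bucket, mirroring the bitSet + queue logic of processChunk*BatchAffine.
func schedule(ops []op, nbBuckets, batchCap int, flush func([]op)) {
	inBatch := make([]bool, nbBuckets) // plays the role of the bitSet BS
	batch := make([]op, 0, batchCap)
	var queue []op

	reset := func() {
		flush(batch)
		batch = batch[:0]
		for i := range inBatch {
			inBatch[i] = false
		}
	}
	tryAdd := func(o op) bool {
		if inBatch[o.bucketID] {
			return false // bucket already used in this batch
		}
		inBatch[o.bucketID] = true
		batch = append(batch, o)
		if len(batch) == batchCap {
			reset()
		}
		return true
	}

	for _, o := range ops {
		if !tryAdd(o) {
			queue = append(queue, o) // park the conflicting op
		}
	}
	for len(queue) > 0 { // drain: flush even partial batches to free buckets
		rest := queue[:0]
		for _, o := range queue {
			if !tryAdd(o) {
				rest = append(rest, o)
			}
		}
		queue = rest
		if len(batch) > 0 {
			reset()
		}
	}
	if len(batch) > 0 {
		reset()
	}
}

func main() {
	sums := make([]int, 4)
	flush := func(batch []op) {
		for _, o := range batch {
			sums[o.bucketID] += o.value // stands in for batchAdd*Affine
		}
	}
	schedule([]op{{1, 5}, {1, 7}, {2, 1}, {1, 2}, {3, 9}}, 4, 2, flush)
	fmt.Println(sums) // [0 14 1 9]
}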
@@ -222,12 +246,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC4 [1 << (4 - 1)]G1Affine -type bucketG1AffineC5 [1 << (5 - 1)]G1Affine -type bucketG1AffineC6 [1 << (6 - 1)]G1Affine -type bucketG1AffineC7 [1 << (7 - 1)]G1Affine -type bucketG1AffineC8 [1 << (8 - 1)]G1Affine -type bucketG1AffineC9 [1 << (9 - 1)]G1Affine type bucketG1AffineC10 [1 << (10 - 1)]G1Affine type bucketG1AffineC11 [1 << (11 - 1)]G1Affine type bucketG1AffineC12 [1 << (12 - 1)]G1Affine @@ -236,14 +254,9 @@ type bucketG1AffineC14 [1 << (14 - 1)]G1Affine type bucketG1AffineC15 [1 << (15 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +// buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { - bucketG1AffineC4 | - bucketG1AffineC5 | - bucketG1AffineC6 | - bucketG1AffineC7 | - bucketG1AffineC8 | - bucketG1AffineC9 | - bucketG1AffineC10 | + bucketG1AffineC10 | bucketG1AffineC11 | bucketG1AffineC12 | bucketG1AffineC13 | @@ -252,13 +265,95 @@ type ibG1Affine interface { bucketG1AffineC16 } +// array of coordinates fp.Element +type cG1Affine interface { + cG1AffineC10 | + cG1AffineC11 | + cG1AffineC12 | + cG1AffineC13 | + cG1AffineC14 | + cG1AffineC15 | + cG1AffineC16 +} + +// buckets: array of G1Affine points (for the batch addition) +type pG1Affine interface { + pG1AffineC10 | + pG1AffineC11 | + pG1AffineC12 | + pG1AffineC13 | + pG1AffineC14 | + pG1AffineC15 | + pG1AffineC16 +} + +// buckets: array of *G1Affine points (for the batch addition) +type ppG1Affine interface { + ppG1AffineC10 | + ppG1AffineC11 | + ppG1AffineC12 | + ppG1AffineC13 | + ppG1AffineC14 | + ppG1AffineC15 | + ppG1AffineC16 +} + +// buckets: array of G1Affine queue operations (for the batch addition) +type qOpsG1Affine interface { + qOpsG1AffineC10 | + qOpsG1AffineC11 | + qOpsG1AffineC12 | + qOpsG1AffineC13 | + qOpsG1AffineC14 | + qOpsG1AffineC15 | + qOpsG1AffineC16 +} +type cG1AffineC10 [80]fp.Element +type pG1AffineC10 [80]G1Affine +type ppG1AffineC10 [80]*G1Affine +type qOpsG1AffineC10 [80]batchOpG1Affine +type cG1AffineC11 [150]fp.Element +type pG1AffineC11 [150]G1Affine +type ppG1AffineC11 [150]*G1Affine +type qOpsG1AffineC11 [150]batchOpG1Affine +type cG1AffineC12 [200]fp.Element +type pG1AffineC12 [200]G1Affine +type ppG1AffineC12 [200]*G1Affine +type qOpsG1AffineC12 [200]batchOpG1Affine +type cG1AffineC13 [350]fp.Element +type pG1AffineC13 [350]G1Affine +type ppG1AffineC13 [350]*G1Affine +type qOpsG1AffineC13 [350]batchOpG1Affine +type cG1AffineC14 [400]fp.Element +type pG1AffineC14 [400]G1Affine +type ppG1AffineC14 [400]*G1Affine +type qOpsG1AffineC14 [400]batchOpG1Affine +type cG1AffineC15 [500]fp.Element +type pG1AffineC15 [500]G1Affine +type ppG1AffineC15 [500]*G1Affine +type qOpsG1AffineC15 [500]batchOpG1Affine +type cG1AffineC16 [640]fp.Element +type pG1AffineC16 [640]G1Affine +type ppG1AffineC16 [640]*G1Affine +type qOpsG1AffineC16 [640]batchOpG1Affine + +type batchOpG2Affine struct { + bucketID uint16 + point G2Affine +} + +func (o batchOpG2Affine) isNeg() bool { + return o.bucketID&1 == 1 +} + // processChunkG2BatchAffine process a chunk of the scalars during the msm // using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition // we use a batch affine addition. 
// // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( + chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -272,22 +367,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G2Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G2Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -301,76 +387,98 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG2Affine(R[:cptAdd], P[:cptAdd]) + batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG2Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() return } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G2Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . 
+ if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue + } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() } + queue[i] = queue[qID-1] + qID-- } } @@ -379,7 +487,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -389,40 +497,47 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
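The loop above relies on a compact digit convention: a non-zero signed digit carries its sign in the low bit (odd means subtract) and the bucket index in the remaining bits, which is what lets a window of width c get away with 1 << (c-1) buckets. A small standalone decoder may make this easier to follow; decodeDigit is a hypothetical reading aid, not part of the generated code, and assumes the digit == 0 case has already been skipped:

// decodeDigit mirrors the decoding done inline in the digit loop above.
// Odd digits encode a subtraction from bucket digit>>1; even non-zero
// digits encode an addition into bucket (digit>>1)-1.
func decodeDigit(digit uint16) (bucketID uint16, isAdd bool) {
	isAdd = digit&1 == 0
	bucketID = digit >> 1
	if isAdd {
		bucketID--
	}
	return bucketID, isAdd
}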
@@ -447,12 +562,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC4 [1 << (4 - 1)]G2Affine -type bucketG2AffineC5 [1 << (5 - 1)]G2Affine -type bucketG2AffineC6 [1 << (6 - 1)]G2Affine -type bucketG2AffineC7 [1 << (7 - 1)]G2Affine -type bucketG2AffineC8 [1 << (8 - 1)]G2Affine -type bucketG2AffineC9 [1 << (9 - 1)]G2Affine type bucketG2AffineC10 [1 << (10 - 1)]G2Affine type bucketG2AffineC11 [1 << (11 - 1)]G2Affine type bucketG2AffineC12 [1 << (12 - 1)]G2Affine @@ -461,14 +570,9 @@ type bucketG2AffineC14 [1 << (14 - 1)]G2Affine type bucketG2AffineC15 [1 << (15 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +// buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { - bucketG2AffineC4 | - bucketG2AffineC5 | - bucketG2AffineC6 | - bucketG2AffineC7 | - bucketG2AffineC8 | - bucketG2AffineC9 | - bucketG2AffineC10 | + bucketG2AffineC10 | bucketG2AffineC11 | bucketG2AffineC12 | bucketG2AffineC13 | @@ -477,6 +581,78 @@ type ibG2Affine interface { bucketG2AffineC16 } +// array of coordinates fptower.E2 +type cG2Affine interface { + cG2AffineC10 | + cG2AffineC11 | + cG2AffineC12 | + cG2AffineC13 | + cG2AffineC14 | + cG2AffineC15 | + cG2AffineC16 +} + +// buckets: array of G2Affine points (for the batch addition) +type pG2Affine interface { + pG2AffineC10 | + pG2AffineC11 | + pG2AffineC12 | + pG2AffineC13 | + pG2AffineC14 | + pG2AffineC15 | + pG2AffineC16 +} + +// buckets: array of *G2Affine points (for the batch addition) +type ppG2Affine interface { + ppG2AffineC10 | + ppG2AffineC11 | + ppG2AffineC12 | + ppG2AffineC13 | + ppG2AffineC14 | + ppG2AffineC15 | + ppG2AffineC16 +} + +// buckets: array of G2Affine queue operations (for the batch addition) +type qOpsG2Affine interface { + qOpsG2AffineC10 | + qOpsG2AffineC11 | + qOpsG2AffineC12 | + qOpsG2AffineC13 | + qOpsG2AffineC14 | + qOpsG2AffineC15 | + qOpsG2AffineC16 +} +type cG2AffineC10 [80]fptower.E2 +type pG2AffineC10 [80]G2Affine +type ppG2AffineC10 [80]*G2Affine +type qOpsG2AffineC10 [80]batchOpG2Affine +type cG2AffineC11 [150]fptower.E2 +type pG2AffineC11 [150]G2Affine +type ppG2AffineC11 [150]*G2Affine +type qOpsG2AffineC11 [150]batchOpG2Affine +type cG2AffineC12 [200]fptower.E2 +type pG2AffineC12 [200]G2Affine +type ppG2AffineC12 [200]*G2Affine +type qOpsG2AffineC12 [200]batchOpG2Affine +type cG2AffineC13 [350]fptower.E2 +type pG2AffineC13 [350]G2Affine +type ppG2AffineC13 [350]*G2Affine +type qOpsG2AffineC13 [350]batchOpG2Affine +type cG2AffineC14 [400]fptower.E2 +type pG2AffineC14 [400]G2Affine +type ppG2AffineC14 [400]*G2Affine +type qOpsG2AffineC14 [400]batchOpG2Affine +type cG2AffineC15 [500]fptower.E2 +type pG2AffineC15 [500]G2Affine +type ppG2AffineC15 [500]*G2Affine +type qOpsG2AffineC15 [500]batchOpG2Affine +type cG2AffineC16 [640]fptower.E2 +type pG2AffineC16 [640]G2Affine +type ppG2AffineC16 [640]*G2Affine +type qOpsG2AffineC16 [640]batchOpG2Affine + type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool type bitSetC6 [1 << (6 - 1)]bool diff --git a/ecc/bls12-378/g1.go b/ecc/bls12-378/g1.go index 08bd9e0fda..67d64790d7 100644 --- a/ecc/bls12-378/g1.go +++ b/ecc/bls12-378/g1.go @@ -983,20 +983,31 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAddG1Affine(R 
[]*G1Affine, P []G1Affine) {
-	batchSize := len(R)
-	if batchSize == 0 {
-		return
-	}
-	var lambda, lambdain [MAX_BATCH_SIZE]fp.Element
+func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, batchSize int) {
+	var lambda, lambdain TC

 	// add part
 	for j := 0; j < batchSize; j++ {
-		lambdain[j].Sub(&P[j].X, &R[j].X)
+		lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X)
 	}

-	// invert denominator
-	batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize])
+	// invert denominator using Montgomery batch inversion technique
+	{
+		var accumulator fp.Element
+		accumulator.SetOne()
+
+		for i := 0; i < batchSize; i++ {
+			lambda[i] = accumulator
+			accumulator.Mul(&accumulator, &lambdain[i])
+		}
+
+		accumulator.Inverse(&accumulator)
+
+		for i := batchSize - 1; i >= 0; i-- {
+			lambda[i].Mul(&lambda[i], &accumulator)
+			accumulator.Mul(&accumulator, &lambdain[i])
+		}
+	}

 	var d fp.Element
 	var rr G1Affine
@@ -1004,36 +1015,16 @@ func batchAddG1Affine(R []*G1Affine, P []G1Affine) {
 	// add part
 	for j := 0; j < batchSize; j++ {
 		// compute lambda
-		d.Sub(&P[j].Y, &R[j].Y)
+		d.Sub(&(*P)[j].Y, &(*R)[j].Y)
 		lambda[j].Mul(&lambda[j], &d)

 		// compute X, Y
 		rr.X.Square(&lambda[j])
-		rr.X.Sub(&rr.X, &R[j].X)
-		rr.X.Sub(&rr.X, &P[j].X)
-		d.Sub(&R[j].X, &rr.X)
+		rr.X.Sub(&rr.X, &(*R)[j].X)
+		rr.X.Sub(&rr.X, &(*P)[j].X)
+		d.Sub(&(*R)[j].X, &rr.X)
 		rr.Y.Mul(&lambda[j], &d)
-		rr.Y.Sub(&rr.Y, &R[j].Y)
-		R[j].Set(&rr)
-	}
-}
-
-// batch inversion
-// similar to BatchInvertfp.Element, ignores edge cases
-func batchInvertG1Affine(res, a []fp.Element) {
-
-	var accumulator fp.Element
-	accumulator.SetOne()
-
-	for i := 0; i < len(res); i++ {
-		res[i] = accumulator
-		accumulator.Mul(&accumulator, &a[i])
-	}
-
-	accumulator.Inverse(&accumulator)
-
-	for i := len(res) - 1; i >= 0; i-- {
-		res[i].Mul(&res[i], &accumulator)
-		accumulator.Mul(&accumulator, &a[i])
+		rr.Y.Sub(&rr.Y, &(*R)[j].Y)
+		(*R)[j].Set(&rr)
 	}
 }
diff --git a/ecc/bls12-378/g2.go b/ecc/bls12-378/g2.go
index 840049be26..905e2ba893 100644
--- a/ecc/bls12-378/g2.go
+++ b/ecc/bls12-378/g2.go
@@ -979,20 +979,31 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin
 // batch add affine coordinates
 // using batch inversion
 // special cases (doubling, infinity) must be filtered out before this call
-func batchAddG2Affine(R []*G2Affine, P []G2Affine) {
-	batchSize := len(R)
-	if batchSize == 0 {
-		return
-	}
-	var lambda, lambdain [MAX_BATCH_SIZE]fptower.E2
+func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, batchSize int) {
+	var lambda, lambdain TC

 	// add part
 	for j := 0; j < batchSize; j++ {
-		lambdain[j].Sub(&P[j].X, &R[j].X)
+		lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X)
 	}

-	// invert denominator
-	batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize])
+	// invert denominator using Montgomery batch inversion technique
+	{
+		var accumulator fptower.E2
+		accumulator.SetOne()
+
+		for i := 0; i < batchSize; i++ {
+			lambda[i] = accumulator
+			accumulator.Mul(&accumulator, &lambdain[i])
+		}
+
+		accumulator.Inverse(&accumulator)
+
+		for i := batchSize - 1; i >= 0; i-- {
+			lambda[i].Mul(&lambda[i], &accumulator)
+			accumulator.Mul(&accumulator, &lambdain[i])
+		}
+	}

 	var d fptower.E2
 	var rr G2Affine
@@ -1000,36 +1011,16 @@ func batchAddG2Affine(R []*G2Affine, P []G2Affine) {
 	// add part
 	for j := 0; j < batchSize; j++ {
 		// compute lambda
-		d.Sub(&P[j].Y, &R[j].Y)
+		d.Sub(&(*P)[j].Y, &(*R)[j].Y)
 		lambda[j].Mul(&lambda[j], &d)

 		// compute X, Y
 		rr.X.Square(&lambda[j])
-		rr.X.Sub(&rr.X, &R[j].X)
-		rr.X.Sub(&rr.X,
&P[j].X) - d.Sub(&R[j].X, &rr.X) + rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) - } -} - -// batch inversion -// similar to BatchInvertfptower.E2, ignores edge cases -func batchInvertG2Affine(res, a []fptower.E2) { - - var accumulator fptower.E2 - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index e88325d4ae..d65962591c 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -174,31 +174,31 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qOpsG1AffineC10, cG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qOpsG1AffineC11, cG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qOpsG1AffineC12, cG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qOpsG1AffineC13, cG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qOpsG1AffineC14, cG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15] + processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qOpsG1AffineC15, cG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] _innerMsmG1(p, 16, points, 
digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -422,31 +422,31 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10] + processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qOpsG2AffineC10, cG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qOpsG2AffineC11, cG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qOpsG2AffineC12, cG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13] + processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qOpsG2AffineC13, cG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qOpsG2AffineC14, cG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15] + processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qOpsG2AffineC15, cG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index 6023381a17..f060ffc11a 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -16,15 +16,18 @@ package bls12378 -const MAX_BATCH_SIZE = 600 +import ( + "github.com/consensys/gnark-crypto/ecc/bls12-378/fp" + "github.com/consensys/gnark-crypto/ecc/bls12-378/internal/fptower" +) -type batchOp struct { - pointID uint32 +type batchOpG1Affine struct { bucketID uint16 + point G1Affine } -func (o batchOp) isNeg() bool { - return o.pointID&1 == 1 +func (o batchOpG1Affine) isNeg() bool { + return o.bucketID&1 == 1 } // processChunkG1BatchAffine process a chunk of 
the scalars during the msm @@ -33,7 +36,8 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( + chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -47,22 +51,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G1Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G1Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -76,76 +71,98 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG1Affine(R[:cptAdd], P[:cptAdd]) + batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG1Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() return } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G1Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . 
+ if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[qID-1] + qID-- } } @@ -154,7 +171,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -164,40 +181,47 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
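executeAndReset above hands the whole batch to batchAddG1Affine, which amortizes a single field inversion over every addition via the Montgomery batch inversion trick. A self-contained, slice-based sketch of just that trick (it assumes no input is zero, matching the generated code, which filters infinity and equal-x cases before batching):

package main

import (
	"fmt"

	"github.com/consensys/gnark-crypto/ecc/bls12-378/fp"
)

// batchInvert returns the inverses of a using one field inversion:
// a forward pass stores prefix products, a single Inverse is taken,
// and a backward pass peels off one factor per element.
func batchInvert(a []fp.Element) []fp.Element {
	res := make([]fp.Element, len(a))
	var acc fp.Element
	acc.SetOne()
	for i := 0; i < len(a); i++ {
		res[i] = acc // res[i] = a[0]*...*a[i-1]
		acc.Mul(&acc, &a[i])
	}
	acc.Inverse(&acc) // acc = 1/(a[0]*...*a[n-1])
	for i := len(a) - 1; i >= 0; i-- {
		res[i].Mul(&res[i], &acc) // prefix * suffix-inverse = 1/a[i]
		acc.Mul(&acc, &a[i])      // drop a[i] from the running inverse
	}
	return res
}

func main() {
	var x, y fp.Element
	x.SetUint64(3)
	y.SetUint64(7)
	inv := batchInvert([]fp.Element{x, y})
	fmt.Println(inv[0].String(), inv[1].String())
}

For n elements this costs 3n multiplications and one inversion instead of n inversions, which is what makes affine buckets competitive with the extended Jacobian path used for small windows.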
@@ -222,12 +246,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC4 [1 << (4 - 1)]G1Affine -type bucketG1AffineC5 [1 << (5 - 1)]G1Affine -type bucketG1AffineC6 [1 << (6 - 1)]G1Affine -type bucketG1AffineC7 [1 << (7 - 1)]G1Affine -type bucketG1AffineC8 [1 << (8 - 1)]G1Affine -type bucketG1AffineC9 [1 << (9 - 1)]G1Affine type bucketG1AffineC10 [1 << (10 - 1)]G1Affine type bucketG1AffineC11 [1 << (11 - 1)]G1Affine type bucketG1AffineC12 [1 << (12 - 1)]G1Affine @@ -236,14 +254,9 @@ type bucketG1AffineC14 [1 << (14 - 1)]G1Affine type bucketG1AffineC15 [1 << (15 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +// buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { - bucketG1AffineC4 | - bucketG1AffineC5 | - bucketG1AffineC6 | - bucketG1AffineC7 | - bucketG1AffineC8 | - bucketG1AffineC9 | - bucketG1AffineC10 | + bucketG1AffineC10 | bucketG1AffineC11 | bucketG1AffineC12 | bucketG1AffineC13 | @@ -252,13 +265,95 @@ type ibG1Affine interface { bucketG1AffineC16 } +// array of coordinates fp.Element +type cG1Affine interface { + cG1AffineC10 | + cG1AffineC11 | + cG1AffineC12 | + cG1AffineC13 | + cG1AffineC14 | + cG1AffineC15 | + cG1AffineC16 +} + +// buckets: array of G1Affine points (for the batch addition) +type pG1Affine interface { + pG1AffineC10 | + pG1AffineC11 | + pG1AffineC12 | + pG1AffineC13 | + pG1AffineC14 | + pG1AffineC15 | + pG1AffineC16 +} + +// buckets: array of *G1Affine points (for the batch addition) +type ppG1Affine interface { + ppG1AffineC10 | + ppG1AffineC11 | + ppG1AffineC12 | + ppG1AffineC13 | + ppG1AffineC14 | + ppG1AffineC15 | + ppG1AffineC16 +} + +// buckets: array of G1Affine queue operations (for the batch addition) +type qOpsG1Affine interface { + qOpsG1AffineC10 | + qOpsG1AffineC11 | + qOpsG1AffineC12 | + qOpsG1AffineC13 | + qOpsG1AffineC14 | + qOpsG1AffineC15 | + qOpsG1AffineC16 +} +type cG1AffineC10 [80]fp.Element +type pG1AffineC10 [80]G1Affine +type ppG1AffineC10 [80]*G1Affine +type qOpsG1AffineC10 [80]batchOpG1Affine +type cG1AffineC11 [150]fp.Element +type pG1AffineC11 [150]G1Affine +type ppG1AffineC11 [150]*G1Affine +type qOpsG1AffineC11 [150]batchOpG1Affine +type cG1AffineC12 [200]fp.Element +type pG1AffineC12 [200]G1Affine +type ppG1AffineC12 [200]*G1Affine +type qOpsG1AffineC12 [200]batchOpG1Affine +type cG1AffineC13 [350]fp.Element +type pG1AffineC13 [350]G1Affine +type ppG1AffineC13 [350]*G1Affine +type qOpsG1AffineC13 [350]batchOpG1Affine +type cG1AffineC14 [400]fp.Element +type pG1AffineC14 [400]G1Affine +type ppG1AffineC14 [400]*G1Affine +type qOpsG1AffineC14 [400]batchOpG1Affine +type cG1AffineC15 [500]fp.Element +type pG1AffineC15 [500]G1Affine +type ppG1AffineC15 [500]*G1Affine +type qOpsG1AffineC15 [500]batchOpG1Affine +type cG1AffineC16 [640]fp.Element +type pG1AffineC16 [640]G1Affine +type ppG1AffineC16 [640]*G1Affine +type qOpsG1AffineC16 [640]batchOpG1Affine + +type batchOpG2Affine struct { + bucketID uint16 + point G2Affine +} + +func (o batchOpG2Affine) isNeg() bool { + return o.bucketID&1 == 1 +} + // processChunkG2BatchAffine process a chunk of the scalars during the msm // using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition // we use a batch affine addition. 
// // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( + chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -272,22 +367,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G2Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G2Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -301,76 +387,98 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG2Affine(R[:cptAdd], P[:cptAdd]) + batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG2Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() return } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G2Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . 
+ if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue + } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() } + queue[i] = queue[qID-1] + qID-- } } @@ -379,7 +487,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -389,40 +497,47 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
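Throughout this file, every per-batch buffer (points, bucket pointers, queued ops, coordinates) is typed as a fixed-size array drawn from a union interface, and batchSize := len(P) recovers the size at instantiation time. The point of this pattern is escape analysis: with a compile-time size the compiler can keep the buffers on the stack, where a slice of run-time length would typically be heap allocated. A toy illustration of the pattern, with invented names:

type buf128 [128]uint64
type buf256 [256]uint64

// anyBuf plays the role of the pG1Affine / qOpsG1Affine style unions:
// one named array type per supported size.
type anyBuf interface {
	buf128 | buf256
}

// fill instantiates with a concrete array type, so b has a known size
// and can stay on the stack, unlike make([]uint64, n).
func fill[B anyBuf](seed uint64) uint64 {
	var b B
	for i := 0; i < len(b); i++ {
		b[i] = seed + uint64(i)
	}
	return b[len(b)-1]
}

Calling fill[buf128](1) or fill[buf256](1) selects the size at the call site, exactly as the innerMsm switches select one set of concrete types per window size c.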
@@ -447,12 +562,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC4 [1 << (4 - 1)]G2Affine -type bucketG2AffineC5 [1 << (5 - 1)]G2Affine -type bucketG2AffineC6 [1 << (6 - 1)]G2Affine -type bucketG2AffineC7 [1 << (7 - 1)]G2Affine -type bucketG2AffineC8 [1 << (8 - 1)]G2Affine -type bucketG2AffineC9 [1 << (9 - 1)]G2Affine type bucketG2AffineC10 [1 << (10 - 1)]G2Affine type bucketG2AffineC11 [1 << (11 - 1)]G2Affine type bucketG2AffineC12 [1 << (12 - 1)]G2Affine @@ -461,14 +570,9 @@ type bucketG2AffineC14 [1 << (14 - 1)]G2Affine type bucketG2AffineC15 [1 << (15 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +// buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { - bucketG2AffineC4 | - bucketG2AffineC5 | - bucketG2AffineC6 | - bucketG2AffineC7 | - bucketG2AffineC8 | - bucketG2AffineC9 | - bucketG2AffineC10 | + bucketG2AffineC10 | bucketG2AffineC11 | bucketG2AffineC12 | bucketG2AffineC13 | @@ -477,6 +581,78 @@ type ibG2Affine interface { bucketG2AffineC16 } +// array of coordinates fptower.E2 +type cG2Affine interface { + cG2AffineC10 | + cG2AffineC11 | + cG2AffineC12 | + cG2AffineC13 | + cG2AffineC14 | + cG2AffineC15 | + cG2AffineC16 +} + +// buckets: array of G2Affine points (for the batch addition) +type pG2Affine interface { + pG2AffineC10 | + pG2AffineC11 | + pG2AffineC12 | + pG2AffineC13 | + pG2AffineC14 | + pG2AffineC15 | + pG2AffineC16 +} + +// buckets: array of *G2Affine points (for the batch addition) +type ppG2Affine interface { + ppG2AffineC10 | + ppG2AffineC11 | + ppG2AffineC12 | + ppG2AffineC13 | + ppG2AffineC14 | + ppG2AffineC15 | + ppG2AffineC16 +} + +// buckets: array of G2Affine queue operations (for the batch addition) +type qOpsG2Affine interface { + qOpsG2AffineC10 | + qOpsG2AffineC11 | + qOpsG2AffineC12 | + qOpsG2AffineC13 | + qOpsG2AffineC14 | + qOpsG2AffineC15 | + qOpsG2AffineC16 +} +type cG2AffineC10 [80]fptower.E2 +type pG2AffineC10 [80]G2Affine +type ppG2AffineC10 [80]*G2Affine +type qOpsG2AffineC10 [80]batchOpG2Affine +type cG2AffineC11 [150]fptower.E2 +type pG2AffineC11 [150]G2Affine +type ppG2AffineC11 [150]*G2Affine +type qOpsG2AffineC11 [150]batchOpG2Affine +type cG2AffineC12 [200]fptower.E2 +type pG2AffineC12 [200]G2Affine +type ppG2AffineC12 [200]*G2Affine +type qOpsG2AffineC12 [200]batchOpG2Affine +type cG2AffineC13 [350]fptower.E2 +type pG2AffineC13 [350]G2Affine +type ppG2AffineC13 [350]*G2Affine +type qOpsG2AffineC13 [350]batchOpG2Affine +type cG2AffineC14 [400]fptower.E2 +type pG2AffineC14 [400]G2Affine +type ppG2AffineC14 [400]*G2Affine +type qOpsG2AffineC14 [400]batchOpG2Affine +type cG2AffineC15 [500]fptower.E2 +type pG2AffineC15 [500]G2Affine +type ppG2AffineC15 [500]*G2Affine +type qOpsG2AffineC15 [500]batchOpG2Affine +type cG2AffineC16 [640]fptower.E2 +type pG2AffineC16 [640]G2Affine +type ppG2AffineC16 [640]*G2Affine +type qOpsG2AffineC16 [640]batchOpG2Affine + type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool type bitSetC6 [1 << (6 - 1)]bool diff --git a/ecc/bls12-381/g1.go b/ecc/bls12-381/g1.go index 6c3edabedc..474c868025 100644 --- a/ecc/bls12-381/g1.go +++ b/ecc/bls12-381/g1.go @@ -983,20 +983,31 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAddG1Affine(R 
[]*G1Affine, P []G1Affine) {
-	batchSize := len(R)
-	if batchSize == 0 {
-		return
-	}
-	var lambda, lambdain [MAX_BATCH_SIZE]fp.Element
+func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, batchSize int) {
+	var lambda, lambdain TC

 	// add part
 	for j := 0; j < batchSize; j++ {
-		lambdain[j].Sub(&P[j].X, &R[j].X)
+		lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X)
 	}

-	// invert denominator
-	batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize])
+	// invert denominator using Montgomery batch inversion technique
+	{
+		var accumulator fp.Element
+		accumulator.SetOne()
+
+		for i := 0; i < batchSize; i++ {
+			lambda[i] = accumulator
+			accumulator.Mul(&accumulator, &lambdain[i])
+		}
+
+		accumulator.Inverse(&accumulator)
+
+		for i := batchSize - 1; i >= 0; i-- {
+			lambda[i].Mul(&lambda[i], &accumulator)
+			accumulator.Mul(&accumulator, &lambdain[i])
+		}
+	}

 	var d fp.Element
 	var rr G1Affine
@@ -1004,36 +1015,16 @@ func batchAddG1Affine(R []*G1Affine, P []G1Affine) {
 	// add part
 	for j := 0; j < batchSize; j++ {
 		// compute lambda
-		d.Sub(&P[j].Y, &R[j].Y)
+		d.Sub(&(*P)[j].Y, &(*R)[j].Y)
 		lambda[j].Mul(&lambda[j], &d)

 		// compute X, Y
 		rr.X.Square(&lambda[j])
-		rr.X.Sub(&rr.X, &R[j].X)
-		rr.X.Sub(&rr.X, &P[j].X)
-		d.Sub(&R[j].X, &rr.X)
+		rr.X.Sub(&rr.X, &(*R)[j].X)
+		rr.X.Sub(&rr.X, &(*P)[j].X)
+		d.Sub(&(*R)[j].X, &rr.X)
 		rr.Y.Mul(&lambda[j], &d)
-		rr.Y.Sub(&rr.Y, &R[j].Y)
-		R[j].Set(&rr)
-	}
-}
-
-// batch inversion
-// similar to BatchInvertfp.Element, ignores edge cases
-func batchInvertG1Affine(res, a []fp.Element) {
-
-	var accumulator fp.Element
-	accumulator.SetOne()
-
-	for i := 0; i < len(res); i++ {
-		res[i] = accumulator
-		accumulator.Mul(&accumulator, &a[i])
-	}
-
-	accumulator.Inverse(&accumulator)
-
-	for i := len(res) - 1; i >= 0; i-- {
-		res[i].Mul(&res[i], &accumulator)
-		accumulator.Mul(&accumulator, &a[i])
+		rr.Y.Sub(&rr.Y, &(*R)[j].Y)
+		(*R)[j].Set(&rr)
 	}
 }
diff --git a/ecc/bls12-381/g2.go b/ecc/bls12-381/g2.go
index 3768ec7eda..a8575f59f7 100644
--- a/ecc/bls12-381/g2.go
+++ b/ecc/bls12-381/g2.go
@@ -980,20 +980,31 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin
 // batch add affine coordinates
 // using batch inversion
 // special cases (doubling, infinity) must be filtered out before this call
-func batchAddG2Affine(R []*G2Affine, P []G2Affine) {
-	batchSize := len(R)
-	if batchSize == 0 {
-		return
-	}
-	var lambda, lambdain [MAX_BATCH_SIZE]fptower.E2
+func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, batchSize int) {
+	var lambda, lambdain TC

 	// add part
 	for j := 0; j < batchSize; j++ {
-		lambdain[j].Sub(&P[j].X, &R[j].X)
+		lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X)
 	}

-	// invert denominator
-	batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize])
+	// invert denominator using Montgomery batch inversion technique
+	{
+		var accumulator fptower.E2
+		accumulator.SetOne()
+
+		for i := 0; i < batchSize; i++ {
+			lambda[i] = accumulator
+			accumulator.Mul(&accumulator, &lambdain[i])
+		}
+
+		accumulator.Inverse(&accumulator)
+
+		for i := batchSize - 1; i >= 0; i-- {
+			lambda[i].Mul(&lambda[i], &accumulator)
+			accumulator.Mul(&accumulator, &lambdain[i])
+		}
+	}

 	var d fptower.E2
 	var rr G2Affine
@@ -1001,36 +1012,16 @@ func batchAddG2Affine(R []*G2Affine, P []G2Affine) {
 	// add part
 	for j := 0; j < batchSize; j++ {
 		// compute lambda
-		d.Sub(&P[j].Y, &R[j].Y)
+		d.Sub(&(*P)[j].Y, &(*R)[j].Y)
 		lambda[j].Mul(&lambda[j], &d)

 		// compute X, Y
 		rr.X.Square(&lambda[j])
-		rr.X.Sub(&rr.X, &R[j].X)
-		rr.X.Sub(&rr.X,
&P[j].X) - d.Sub(&R[j].X, &rr.X) + rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) - } -} - -// batch inversion -// similar to BatchInvertfptower.E2, ignores edge cases -func batchInvertG2Affine(res, a []fptower.E2) { - - var accumulator fptower.E2 - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index fabd850c74..edcb161b5e 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -174,31 +174,31 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qOpsG1AffineC10, cG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qOpsG1AffineC11, cG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qOpsG1AffineC12, cG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qOpsG1AffineC13, cG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qOpsG1AffineC14, cG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15] + processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qOpsG1AffineC15, cG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] _innerMsmG1(p, 16, points, 
digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -422,31 +422,31 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10] + processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qOpsG2AffineC10, cG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qOpsG2AffineC11, cG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qOpsG2AffineC12, cG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13] + processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qOpsG2AffineC13, cG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qOpsG2AffineC14, cG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15] + processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qOpsG2AffineC15, cG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index f388d3d5ea..da6be7a817 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -16,15 +16,18 @@ package bls12381 -const MAX_BATCH_SIZE = 600 +import ( + "github.com/consensys/gnark-crypto/ecc/bls12-381/fp" + "github.com/consensys/gnark-crypto/ecc/bls12-381/internal/fptower" +) -type batchOp struct { - pointID uint32 +type batchOpG1Affine struct { bucketID uint16 + point G1Affine } -func (o batchOp) isNeg() bool { - return o.pointID&1 == 1 +func (o batchOpG1Affine) isNeg() bool { + return o.bucketID&1 == 1 } // processChunkG1BatchAffine process a chunk of 
the scalars during the msm @@ -33,7 +36,8 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( + chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -47,22 +51,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G1Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G1Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -76,76 +71,98 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG1Affine(R[:cptAdd], P[:cptAdd]) + batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG1Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() return } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G1Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . 
+ if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[qID-1] + qID-- } } @@ -154,7 +171,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -164,40 +181,47 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
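One invariant drives all of the canAdd / queue machinery above: a bucket may join a batch at most once, because every lambda in the batch is derived from bucket coordinates read before the shared inversion, so a second addition into the same bucket within one batch would work on stale values. Conflicting ops are therefore parked in a queue and retried after a flush. A miniature of that scheduling discipline, with made-up sizes and without the EC arithmetic:

package main

import "fmt"

const batchCap = 3

type op struct{ bucketID uint16 }

// schedule batches ops so that no two ops in a batch share a bucket,
// deferring conflicts to a queue, as processChunkG1BatchAffine does.
func schedule(ops []op, flush func([]op)) {
	var used [8]bool // one flag per bucket, like the bitSetCxx types
	batch := make([]op, 0, batchCap)
	var queue []op

	reset := func() {
		flush(batch)
		batch = batch[:0]
		used = [8]bool{}
	}
	push := func(o op) {
		used[o.bucketID] = true
		batch = append(batch, o)
		if len(batch) == batchCap {
			reset()
		}
	}

	for _, o := range ops {
		if used[o.bucketID] {
			queue = append(queue, o) // bucket already in this batch: defer
			continue
		}
		push(o)
	}
	for len(queue) > 0 { // drain deferred ops, flushing between passes
		pending := queue
		queue = nil
		for _, o := range pending {
			if used[o.bucketID] {
				queue = append(queue, o)
				continue
			}
			push(o)
		}
		if len(batch) > 0 {
			reset()
		}
	}
	if len(batch) > 0 {
		reset() // flush the final partial batch
	}
}

func main() {
	schedule([]op{{1}, {2}, {1}, {1}, {3}}, func(b []op) {
		fmt.Println("flush:", b)
	})
}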
@@ -222,12 +246,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC4 [1 << (4 - 1)]G1Affine -type bucketG1AffineC5 [1 << (5 - 1)]G1Affine -type bucketG1AffineC6 [1 << (6 - 1)]G1Affine -type bucketG1AffineC7 [1 << (7 - 1)]G1Affine -type bucketG1AffineC8 [1 << (8 - 1)]G1Affine -type bucketG1AffineC9 [1 << (9 - 1)]G1Affine type bucketG1AffineC10 [1 << (10 - 1)]G1Affine type bucketG1AffineC11 [1 << (11 - 1)]G1Affine type bucketG1AffineC12 [1 << (12 - 1)]G1Affine @@ -236,14 +254,9 @@ type bucketG1AffineC14 [1 << (14 - 1)]G1Affine type bucketG1AffineC15 [1 << (15 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +// buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { - bucketG1AffineC4 | - bucketG1AffineC5 | - bucketG1AffineC6 | - bucketG1AffineC7 | - bucketG1AffineC8 | - bucketG1AffineC9 | - bucketG1AffineC10 | + bucketG1AffineC10 | bucketG1AffineC11 | bucketG1AffineC12 | bucketG1AffineC13 | @@ -252,13 +265,95 @@ type ibG1Affine interface { bucketG1AffineC16 } +// array of coordinates fp.Element +type cG1Affine interface { + cG1AffineC10 | + cG1AffineC11 | + cG1AffineC12 | + cG1AffineC13 | + cG1AffineC14 | + cG1AffineC15 | + cG1AffineC16 +} + +// buckets: array of G1Affine points (for the batch addition) +type pG1Affine interface { + pG1AffineC10 | + pG1AffineC11 | + pG1AffineC12 | + pG1AffineC13 | + pG1AffineC14 | + pG1AffineC15 | + pG1AffineC16 +} + +// buckets: array of *G1Affine points (for the batch addition) +type ppG1Affine interface { + ppG1AffineC10 | + ppG1AffineC11 | + ppG1AffineC12 | + ppG1AffineC13 | + ppG1AffineC14 | + ppG1AffineC15 | + ppG1AffineC16 +} + +// buckets: array of G1Affine queue operations (for the batch addition) +type qOpsG1Affine interface { + qOpsG1AffineC10 | + qOpsG1AffineC11 | + qOpsG1AffineC12 | + qOpsG1AffineC13 | + qOpsG1AffineC14 | + qOpsG1AffineC15 | + qOpsG1AffineC16 +} +type cG1AffineC10 [80]fp.Element +type pG1AffineC10 [80]G1Affine +type ppG1AffineC10 [80]*G1Affine +type qOpsG1AffineC10 [80]batchOpG1Affine +type cG1AffineC11 [150]fp.Element +type pG1AffineC11 [150]G1Affine +type ppG1AffineC11 [150]*G1Affine +type qOpsG1AffineC11 [150]batchOpG1Affine +type cG1AffineC12 [200]fp.Element +type pG1AffineC12 [200]G1Affine +type ppG1AffineC12 [200]*G1Affine +type qOpsG1AffineC12 [200]batchOpG1Affine +type cG1AffineC13 [350]fp.Element +type pG1AffineC13 [350]G1Affine +type ppG1AffineC13 [350]*G1Affine +type qOpsG1AffineC13 [350]batchOpG1Affine +type cG1AffineC14 [400]fp.Element +type pG1AffineC14 [400]G1Affine +type ppG1AffineC14 [400]*G1Affine +type qOpsG1AffineC14 [400]batchOpG1Affine +type cG1AffineC15 [500]fp.Element +type pG1AffineC15 [500]G1Affine +type ppG1AffineC15 [500]*G1Affine +type qOpsG1AffineC15 [500]batchOpG1Affine +type cG1AffineC16 [640]fp.Element +type pG1AffineC16 [640]G1Affine +type ppG1AffineC16 [640]*G1Affine +type qOpsG1AffineC16 [640]batchOpG1Affine + +type batchOpG2Affine struct { + bucketID uint16 + point G2Affine +} + +func (o batchOpG2Affine) isNeg() bool { + return o.bucketID&1 == 1 +} + // processChunkG2BatchAffine process a chunk of the scalars during the msm // using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition // we use a batch affine addition. 
// // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( + chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -272,22 +367,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G2Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G2Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -301,76 +387,98 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG2Affine(R[:cptAdd], P[:cptAdd]) + batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG2Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() return } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G2Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . 
+ if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue + } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() } + queue[i] = queue[qID-1] + qID-- } } @@ -379,7 +487,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -389,40 +497,47 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
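Once the shared inversion has produced 1/(x2-x1) for every pair, each addition in the batch reduces to the plain affine chord formulas. A hypothetical single-pair helper over the bls12-381 base field (not part of the package) showing the per-point work that batchAddG1Affine performs:

package sketch

import "github.com/consensys/gnark-crypto/ecc/bls12-381/fp"

// affineAdd computes (x3, y3) = (x1, y1) + (x2, y2) given inv = 1/(x2-x1)
// from a batch inversion. Doubling and infinity are assumed to have been
// filtered out beforehand, as in the generated code.
func affineAdd(x1, y1, x2, y2, inv fp.Element) (x3, y3 fp.Element) {
	var lambda, d fp.Element
	d.Sub(&y2, &y1)
	lambda.Mul(&inv, &d) // lambda = (y2-y1)/(x2-x1)
	x3.Square(&lambda)   // x3 = lambda^2 - x1 - x2
	x3.Sub(&x3, &x1)
	x3.Sub(&x3, &x2)
	d.Sub(&x1, &x3)
	y3.Mul(&lambda, &d) // y3 = lambda*(x1-x3) - y1
	y3.Sub(&y3, &y1)
	return x3, y3
}

Per pair this is two multiplications and a squaring, on top of roughly three multiplications per element from the two batch-inversion passes, so a batch of n additions costs about one inversion plus 5n to 6n field multiplications instead of n inversions.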
@@ -447,12 +562,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC4 [1 << (4 - 1)]G2Affine -type bucketG2AffineC5 [1 << (5 - 1)]G2Affine -type bucketG2AffineC6 [1 << (6 - 1)]G2Affine -type bucketG2AffineC7 [1 << (7 - 1)]G2Affine -type bucketG2AffineC8 [1 << (8 - 1)]G2Affine -type bucketG2AffineC9 [1 << (9 - 1)]G2Affine type bucketG2AffineC10 [1 << (10 - 1)]G2Affine type bucketG2AffineC11 [1 << (11 - 1)]G2Affine type bucketG2AffineC12 [1 << (12 - 1)]G2Affine @@ -461,14 +570,9 @@ type bucketG2AffineC14 [1 << (14 - 1)]G2Affine type bucketG2AffineC15 [1 << (15 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +// buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { - bucketG2AffineC4 | - bucketG2AffineC5 | - bucketG2AffineC6 | - bucketG2AffineC7 | - bucketG2AffineC8 | - bucketG2AffineC9 | - bucketG2AffineC10 | + bucketG2AffineC10 | bucketG2AffineC11 | bucketG2AffineC12 | bucketG2AffineC13 | @@ -477,6 +581,78 @@ type ibG2Affine interface { bucketG2AffineC16 } +// array of coordinates fptower.E2 +type cG2Affine interface { + cG2AffineC10 | + cG2AffineC11 | + cG2AffineC12 | + cG2AffineC13 | + cG2AffineC14 | + cG2AffineC15 | + cG2AffineC16 +} + +// buckets: array of G2Affine points (for the batch addition) +type pG2Affine interface { + pG2AffineC10 | + pG2AffineC11 | + pG2AffineC12 | + pG2AffineC13 | + pG2AffineC14 | + pG2AffineC15 | + pG2AffineC16 +} + +// buckets: array of *G2Affine points (for the batch addition) +type ppG2Affine interface { + ppG2AffineC10 | + ppG2AffineC11 | + ppG2AffineC12 | + ppG2AffineC13 | + ppG2AffineC14 | + ppG2AffineC15 | + ppG2AffineC16 +} + +// buckets: array of G2Affine queue operations (for the batch addition) +type qOpsG2Affine interface { + qOpsG2AffineC10 | + qOpsG2AffineC11 | + qOpsG2AffineC12 | + qOpsG2AffineC13 | + qOpsG2AffineC14 | + qOpsG2AffineC15 | + qOpsG2AffineC16 +} +type cG2AffineC10 [80]fptower.E2 +type pG2AffineC10 [80]G2Affine +type ppG2AffineC10 [80]*G2Affine +type qOpsG2AffineC10 [80]batchOpG2Affine +type cG2AffineC11 [150]fptower.E2 +type pG2AffineC11 [150]G2Affine +type ppG2AffineC11 [150]*G2Affine +type qOpsG2AffineC11 [150]batchOpG2Affine +type cG2AffineC12 [200]fptower.E2 +type pG2AffineC12 [200]G2Affine +type ppG2AffineC12 [200]*G2Affine +type qOpsG2AffineC12 [200]batchOpG2Affine +type cG2AffineC13 [350]fptower.E2 +type pG2AffineC13 [350]G2Affine +type ppG2AffineC13 [350]*G2Affine +type qOpsG2AffineC13 [350]batchOpG2Affine +type cG2AffineC14 [400]fptower.E2 +type pG2AffineC14 [400]G2Affine +type ppG2AffineC14 [400]*G2Affine +type qOpsG2AffineC14 [400]batchOpG2Affine +type cG2AffineC15 [500]fptower.E2 +type pG2AffineC15 [500]G2Affine +type ppG2AffineC15 [500]*G2Affine +type qOpsG2AffineC15 [500]batchOpG2Affine +type cG2AffineC16 [640]fptower.E2 +type pG2AffineC16 [640]G2Affine +type ppG2AffineC16 [640]*G2Affine +type qOpsG2AffineC16 [640]batchOpG2Affine + type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool type bitSetC6 [1 << (6 - 1)]bool diff --git a/ecc/bls24-315/g1.go b/ecc/bls24-315/g1.go index 25faa396cf..bde8a50d43 100644 --- a/ecc/bls24-315/g1.go +++ b/ecc/bls24-315/g1.go @@ -985,20 +985,31 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAddG1Affine(R 
[]*G1Affine, P []G1Affine) { - batchSize := len(R) - if batchSize == 0 { - return - } - var lambda, lambdain [MAX_BATCH_SIZE]fp.Element +func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, batchSize int) { + var lambda, lambdain TC // add part for j := 0; j < batchSize; j++ { - lambdain[j].Sub(&P[j].X, &R[j].X) + lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X) } - // invert denominator - batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) + // invert denominator using montgomery batch invert technique + { + var accumulator fp.Element + accumulator.SetOne() + + for i := 0; i < batchSize; i++ { + lambda[i] = accumulator + accumulator.Mul(&accumulator, &lambdain[i]) + } + + accumulator.Inverse(&accumulator) + + for i := batchSize - 1; i >= 0; i-- { + lambda[i].Mul(&lambda[i], &accumulator) + accumulator.Mul(&accumulator, &lambdain[i]) + } + } var d fp.Element var rr G1Affine @@ -1006,36 +1017,16 @@ func batchAddG1Affine(R []*G1Affine, P []G1Affine) { // add part for j := 0; j < batchSize; j++ { // computa lambda - d.Sub(&P[j].Y, &R[j].Y) + d.Sub(&(*P)[j].Y, &(*R)[j].Y) lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[j].X) - rr.X.Sub(&rr.X, &P[j].X) - d.Sub(&R[j].X, &rr.X) + rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) - } -} - -// batch inversion -// similar to BatchInvertfp.Element, ignores edge cases -func batchInvertG1Affine(res, a []fp.Element) { - - var accumulator fp.Element - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } diff --git a/ecc/bls24-315/g2.go b/ecc/bls24-315/g2.go index 32601c0b08..662bfe0313 100644 --- a/ecc/bls24-315/g2.go +++ b/ecc/bls24-315/g2.go @@ -995,20 +995,31 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAddG2Affine(R []*G2Affine, P []G2Affine) { - batchSize := len(R) - if batchSize == 0 { - return - } - var lambda, lambdain [MAX_BATCH_SIZE]fptower.E4 +func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, batchSize int) { + var lambda, lambdain TC // add part for j := 0; j < batchSize; j++ { - lambdain[j].Sub(&P[j].X, &R[j].X) + lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X) } - // invert denominator - batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) + // invert denominator using montgomery batch invert technique + { + var accumulator fptower.E4 + accumulator.SetOne() + + for i := 0; i < batchSize; i++ { + lambda[i] = accumulator + accumulator.Mul(&accumulator, &lambdain[i]) + } + + accumulator.Inverse(&accumulator) + + for i := batchSize - 1; i >= 0; i-- { + lambda[i].Mul(&lambda[i], &accumulator) + accumulator.Mul(&accumulator, &lambdain[i]) + } + } var d fptower.E4 var rr G2Affine @@ -1016,36 +1027,16 @@ func batchAddG2Affine(R []*G2Affine, P []G2Affine) { // add part for j := 0; j < batchSize; j++ { // computa lambda - d.Sub(&P[j].Y, &R[j].Y) + d.Sub(&(*P)[j].Y, &(*R)[j].Y) lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[j].X) - rr.X.Sub(&rr.X, 
&P[j].X) - d.Sub(&R[j].X, &rr.X) + rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) - } -} - -// batch inversion -// similar to BatchInvertfptower.E4, ignores edge cases -func batchInvertG2Affine(res, a []fptower.E4) { - - var accumulator fptower.E4 - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 207a4c7f23..ebaf6a86f7 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -174,31 +174,31 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qOpsG1AffineC10, cG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qOpsG1AffineC11, cG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qOpsG1AffineC12, cG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qOpsG1AffineC13, cG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qOpsG1AffineC14, cG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15] + processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qOpsG1AffineC15, cG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] _innerMsmG1(p, 16, points, 
digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -422,31 +422,31 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10] + processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qOpsG2AffineC10, cG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qOpsG2AffineC11, cG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qOpsG2AffineC12, cG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13] + processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qOpsG2AffineC13, cG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qOpsG2AffineC14, cG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15] + processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qOpsG2AffineC15, cG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index 9ec9c35382..800c106b7d 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -16,15 +16,18 @@ package bls24315 -const MAX_BATCH_SIZE = 600 +import ( + "github.com/consensys/gnark-crypto/ecc/bls24-315/fp" + "github.com/consensys/gnark-crypto/ecc/bls24-315/internal/fptower" +) -type batchOp struct { - pointID uint32 +type batchOpG1Affine struct { bucketID uint16 + point G1Affine } -func (o batchOp) isNeg() bool { - return o.pointID&1 == 1 +func (o batchOpG1Affine) isNeg() bool { + return o.bucketID&1 == 1 } // processChunkG1BatchAffine process a chunk of 
the scalars during the msm @@ -33,7 +36,8 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( + chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -47,22 +51,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G1Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G1Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -76,76 +71,98 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG1Affine(R[:cptAdd], P[:cptAdd]) + batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG1Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() return } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G1Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . 
+ if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[qID-1] + qID-- } } @@ -154,7 +171,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -164,40 +181,47 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
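The accumulator loops inlined into batchAddG1Affine and batchAddG2Affine in the earlier hunks are Montgomery's batch-inversion trick: one field inversion plus roughly 3(n-1) multiplications replaces n inversions. A self-contained sketch over math/big with a toy prime (illustrative only; the library's fp.Element and fptower APIs differ):

package main

import (
	"fmt"
	"math/big"
)

// batchInvert computes the modular inverses of a[0..n) with a single
// ModInverse call, using prefix products (Montgomery's trick).
func batchInvert(a []*big.Int, p *big.Int) []*big.Int {
	n := len(a)
	res := make([]*big.Int, n)
	acc := big.NewInt(1)
	for i := 0; i < n; i++ {
		res[i] = new(big.Int).Set(acc) // res[i] = a[0]*...*a[i-1]
		acc.Mul(acc, a[i])
		acc.Mod(acc, p)
	}
	acc.ModInverse(acc, p) // acc = (a[0]*...*a[n-1])^-1
	for i := n - 1; i >= 0; i-- {
		res[i].Mul(res[i], acc)
		res[i].Mod(res[i], p) // res[i] = a[i]^-1
		acc.Mul(acc, a[i])
		acc.Mod(acc, p) // acc = (a[0]*...*a[i-1])^-1
	}
	return res
}

func main() {
	p := big.NewInt(97)
	a := []*big.Int{big.NewInt(3), big.NewInt(10), big.NewInt(42)}
	for i, inv := range batchInvert(a, p) {
		check := new(big.Int).Mul(a[i], inv)
		fmt.Println(a[i], "^-1 mod 97 =", inv, "check:", check.Mod(check, p)) // check is always 1
	}
}

This is why the batched variant pays a single inversion regardless of the batch size: the inversion cost is amortized across every addition in the batch.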
@@ -222,12 +246,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC4 [1 << (4 - 1)]G1Affine -type bucketG1AffineC5 [1 << (5 - 1)]G1Affine -type bucketG1AffineC6 [1 << (6 - 1)]G1Affine -type bucketG1AffineC7 [1 << (7 - 1)]G1Affine -type bucketG1AffineC8 [1 << (8 - 1)]G1Affine -type bucketG1AffineC9 [1 << (9 - 1)]G1Affine type bucketG1AffineC10 [1 << (10 - 1)]G1Affine type bucketG1AffineC11 [1 << (11 - 1)]G1Affine type bucketG1AffineC12 [1 << (12 - 1)]G1Affine @@ -236,14 +254,9 @@ type bucketG1AffineC14 [1 << (14 - 1)]G1Affine type bucketG1AffineC15 [1 << (15 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +// buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { - bucketG1AffineC4 | - bucketG1AffineC5 | - bucketG1AffineC6 | - bucketG1AffineC7 | - bucketG1AffineC8 | - bucketG1AffineC9 | - bucketG1AffineC10 | + bucketG1AffineC10 | bucketG1AffineC11 | bucketG1AffineC12 | bucketG1AffineC13 | @@ -252,13 +265,95 @@ type ibG1Affine interface { bucketG1AffineC16 } +// array of coordinates fp.Element +type cG1Affine interface { + cG1AffineC10 | + cG1AffineC11 | + cG1AffineC12 | + cG1AffineC13 | + cG1AffineC14 | + cG1AffineC15 | + cG1AffineC16 +} + +// buckets: array of G1Affine points (for the batch addition) +type pG1Affine interface { + pG1AffineC10 | + pG1AffineC11 | + pG1AffineC12 | + pG1AffineC13 | + pG1AffineC14 | + pG1AffineC15 | + pG1AffineC16 +} + +// buckets: array of *G1Affine points (for the batch addition) +type ppG1Affine interface { + ppG1AffineC10 | + ppG1AffineC11 | + ppG1AffineC12 | + ppG1AffineC13 | + ppG1AffineC14 | + ppG1AffineC15 | + ppG1AffineC16 +} + +// buckets: array of G1Affine queue operations (for the batch addition) +type qOpsG1Affine interface { + qOpsG1AffineC10 | + qOpsG1AffineC11 | + qOpsG1AffineC12 | + qOpsG1AffineC13 | + qOpsG1AffineC14 | + qOpsG1AffineC15 | + qOpsG1AffineC16 +} +type cG1AffineC10 [80]fp.Element +type pG1AffineC10 [80]G1Affine +type ppG1AffineC10 [80]*G1Affine +type qOpsG1AffineC10 [80]batchOpG1Affine +type cG1AffineC11 [150]fp.Element +type pG1AffineC11 [150]G1Affine +type ppG1AffineC11 [150]*G1Affine +type qOpsG1AffineC11 [150]batchOpG1Affine +type cG1AffineC12 [200]fp.Element +type pG1AffineC12 [200]G1Affine +type ppG1AffineC12 [200]*G1Affine +type qOpsG1AffineC12 [200]batchOpG1Affine +type cG1AffineC13 [350]fp.Element +type pG1AffineC13 [350]G1Affine +type ppG1AffineC13 [350]*G1Affine +type qOpsG1AffineC13 [350]batchOpG1Affine +type cG1AffineC14 [400]fp.Element +type pG1AffineC14 [400]G1Affine +type ppG1AffineC14 [400]*G1Affine +type qOpsG1AffineC14 [400]batchOpG1Affine +type cG1AffineC15 [500]fp.Element +type pG1AffineC15 [500]G1Affine +type ppG1AffineC15 [500]*G1Affine +type qOpsG1AffineC15 [500]batchOpG1Affine +type cG1AffineC16 [640]fp.Element +type pG1AffineC16 [640]G1Affine +type ppG1AffineC16 [640]*G1Affine +type qOpsG1AffineC16 [640]batchOpG1Affine + +type batchOpG2Affine struct { + bucketID uint16 + point G2Affine +} + +func (o batchOpG2Affine) isNeg() bool { + return o.bucketID&1 == 1 +} + // processChunkG2BatchAffine process a chunk of the scalars during the msm // using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition // we use a batch affine addition. 
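For reference, the affine addition each batched lane performs is the chord rule: lambda = (y2 - y1) / (x2 - x1), x3 = lambda^2 - x1 - x2, y3 = lambda*(x1 - x3) - y1; the batch exists purely to share the cost of those divisions. A toy check on the textbook curve y^2 = x^3 + 2x + 2 over F_17 (hypothetical helper names, not library code):

package main

import "fmt"

const p = 17 // toy field; the real code works over the curve's fp / fptower elements

func mod(x int) int { return ((x % p) + p) % p }

// inv computes x^(p-2) mod p by repeated multiplication (fine for a 17-element field)
func inv(x int) int {
	r := 1
	for i := 0; i < p-2; i++ {
		r = mod(r * x)
	}
	return r
}

// affineAdd returns P+Q for distinct P, Q with Px != Qx.
func affineAdd(x1, y1, x2, y2 int) (int, int) {
	lambda := mod((y2 - y1) * inv(mod(x2-x1))) // the inversion the batch amortizes
	x3 := mod(lambda*lambda - x1 - x2)
	y3 := mod(lambda*(x1-x3) - y1)
	return x3, y3
}

func main() {
	// P = (5,1), 2P = (6,3) on this curve; P + 2P = 3P = (10,6).
	x3, y3 := affineAdd(5, 1, 6, 3)
	fmt.Println(x3, y3) // 10 6
}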
// // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( + chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -272,22 +367,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G2Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G2Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -301,76 +387,98 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG2Affine(R[:cptAdd], P[:cptAdd]) + batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG2Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() return } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G2Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . 
+ if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue + } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() } + queue[i] = queue[qID-1] + qID-- } } @@ -379,7 +487,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -389,40 +497,47 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
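The fixed array lengths declared for each window (80 entries at c=10 up to 640 at c=16) replace the old runtime choice batchSize := len(buckets) / 20 capped at MAX_BATCH_SIZE; the new sizes are hand-tuned constants so the batch arrays can be concrete types. A small helper, assumed here only for illustration, makes the tuning visible:

package main

import "fmt"

// batchSize mirrors the hand-tuned array lengths declared above
// (an illustrative helper, not part of the patch).
func batchSize(c int) int {
	sizes := map[int]int{10: 80, 11: 150, 12: 200, 13: 350, 14: 400, 15: 500, 16: 640}
	return sizes[c]
}

func main() {
	for c := 10; c <= 16; c++ {
		nbBuckets := 1 << (c - 1)
		fmt.Printf("c=%d: %d buckets, batch of %d (%.1f%% of buckets)\n",
			c, nbBuckets, batchSize(c), 100*float64(batchSize(c))/float64(nbBuckets))
	}
}

Keeping the batch small relative to the bucket count keeps same-bucket collisions, and therefore queue traffic, rare.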
@@ -447,12 +562,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC4 [1 << (4 - 1)]G2Affine -type bucketG2AffineC5 [1 << (5 - 1)]G2Affine -type bucketG2AffineC6 [1 << (6 - 1)]G2Affine -type bucketG2AffineC7 [1 << (7 - 1)]G2Affine -type bucketG2AffineC8 [1 << (8 - 1)]G2Affine -type bucketG2AffineC9 [1 << (9 - 1)]G2Affine type bucketG2AffineC10 [1 << (10 - 1)]G2Affine type bucketG2AffineC11 [1 << (11 - 1)]G2Affine type bucketG2AffineC12 [1 << (12 - 1)]G2Affine @@ -461,14 +570,9 @@ type bucketG2AffineC14 [1 << (14 - 1)]G2Affine type bucketG2AffineC15 [1 << (15 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +// buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { - bucketG2AffineC4 | - bucketG2AffineC5 | - bucketG2AffineC6 | - bucketG2AffineC7 | - bucketG2AffineC8 | - bucketG2AffineC9 | - bucketG2AffineC10 | + bucketG2AffineC10 | bucketG2AffineC11 | bucketG2AffineC12 | bucketG2AffineC13 | @@ -477,6 +581,78 @@ type ibG2Affine interface { bucketG2AffineC16 } +// array of coordinates fptower.E4 +type cG2Affine interface { + cG2AffineC10 | + cG2AffineC11 | + cG2AffineC12 | + cG2AffineC13 | + cG2AffineC14 | + cG2AffineC15 | + cG2AffineC16 +} + +// buckets: array of G2Affine points (for the batch addition) +type pG2Affine interface { + pG2AffineC10 | + pG2AffineC11 | + pG2AffineC12 | + pG2AffineC13 | + pG2AffineC14 | + pG2AffineC15 | + pG2AffineC16 +} + +// buckets: array of *G2Affine points (for the batch addition) +type ppG2Affine interface { + ppG2AffineC10 | + ppG2AffineC11 | + ppG2AffineC12 | + ppG2AffineC13 | + ppG2AffineC14 | + ppG2AffineC15 | + ppG2AffineC16 +} + +// buckets: array of G2Affine queue operations (for the batch addition) +type qOpsG2Affine interface { + qOpsG2AffineC10 | + qOpsG2AffineC11 | + qOpsG2AffineC12 | + qOpsG2AffineC13 | + qOpsG2AffineC14 | + qOpsG2AffineC15 | + qOpsG2AffineC16 +} +type cG2AffineC10 [80]fptower.E4 +type pG2AffineC10 [80]G2Affine +type ppG2AffineC10 [80]*G2Affine +type qOpsG2AffineC10 [80]batchOpG2Affine +type cG2AffineC11 [150]fptower.E4 +type pG2AffineC11 [150]G2Affine +type ppG2AffineC11 [150]*G2Affine +type qOpsG2AffineC11 [150]batchOpG2Affine +type cG2AffineC12 [200]fptower.E4 +type pG2AffineC12 [200]G2Affine +type ppG2AffineC12 [200]*G2Affine +type qOpsG2AffineC12 [200]batchOpG2Affine +type cG2AffineC13 [350]fptower.E4 +type pG2AffineC13 [350]G2Affine +type ppG2AffineC13 [350]*G2Affine +type qOpsG2AffineC13 [350]batchOpG2Affine +type cG2AffineC14 [400]fptower.E4 +type pG2AffineC14 [400]G2Affine +type ppG2AffineC14 [400]*G2Affine +type qOpsG2AffineC14 [400]batchOpG2Affine +type cG2AffineC15 [500]fptower.E4 +type pG2AffineC15 [500]G2Affine +type ppG2AffineC15 [500]*G2Affine +type qOpsG2AffineC15 [500]batchOpG2Affine +type cG2AffineC16 [640]fptower.E4 +type pG2AffineC16 [640]G2Affine +type ppG2AffineC16 [640]*G2Affine +type qOpsG2AffineC16 [640]batchOpG2Affine + type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool type bitSetC6 [1 << (6 - 1)]bool diff --git a/ecc/bls24-317/g1.go b/ecc/bls24-317/g1.go index f1e3773049..cd9452b1ce 100644 --- a/ecc/bls24-317/g1.go +++ b/ecc/bls24-317/g1.go @@ -985,20 +985,31 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAddG1Affine(R 
[]*G1Affine, P []G1Affine) { - batchSize := len(R) - if batchSize == 0 { - return - } - var lambda, lambdain [MAX_BATCH_SIZE]fp.Element +func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, batchSize int) { + var lambda, lambdain TC // add part for j := 0; j < batchSize; j++ { - lambdain[j].Sub(&P[j].X, &R[j].X) + lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X) } - // invert denominator - batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) + // invert denominator using montgomery batch invert technique + { + var accumulator fp.Element + accumulator.SetOne() + + for i := 0; i < batchSize; i++ { + lambda[i] = accumulator + accumulator.Mul(&accumulator, &lambdain[i]) + } + + accumulator.Inverse(&accumulator) + + for i := batchSize - 1; i >= 0; i-- { + lambda[i].Mul(&lambda[i], &accumulator) + accumulator.Mul(&accumulator, &lambdain[i]) + } + } var d fp.Element var rr G1Affine @@ -1006,36 +1017,16 @@ func batchAddG1Affine(R []*G1Affine, P []G1Affine) { // add part for j := 0; j < batchSize; j++ { // computa lambda - d.Sub(&P[j].Y, &R[j].Y) + d.Sub(&(*P)[j].Y, &(*R)[j].Y) lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[j].X) - rr.X.Sub(&rr.X, &P[j].X) - d.Sub(&R[j].X, &rr.X) + rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) - } -} - -// batch inversion -// similar to BatchInvertfp.Element, ignores edge cases -func batchInvertG1Affine(res, a []fp.Element) { - - var accumulator fp.Element - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } diff --git a/ecc/bls24-317/g2.go b/ecc/bls24-317/g2.go index 0f1693cdc8..96d823eaf9 100644 --- a/ecc/bls24-317/g2.go +++ b/ecc/bls24-317/g2.go @@ -995,20 +995,31 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAddG2Affine(R []*G2Affine, P []G2Affine) { - batchSize := len(R) - if batchSize == 0 { - return - } - var lambda, lambdain [MAX_BATCH_SIZE]fptower.E4 +func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, batchSize int) { + var lambda, lambdain TC // add part for j := 0; j < batchSize; j++ { - lambdain[j].Sub(&P[j].X, &R[j].X) + lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X) } - // invert denominator - batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) + // invert denominator using montgomery batch invert technique + { + var accumulator fptower.E4 + accumulator.SetOne() + + for i := 0; i < batchSize; i++ { + lambda[i] = accumulator + accumulator.Mul(&accumulator, &lambdain[i]) + } + + accumulator.Inverse(&accumulator) + + for i := batchSize - 1; i >= 0; i-- { + lambda[i].Mul(&lambda[i], &accumulator) + accumulator.Mul(&accumulator, &lambdain[i]) + } + } var d fptower.E4 var rr G2Affine @@ -1016,36 +1027,16 @@ func batchAddG2Affine(R []*G2Affine, P []G2Affine) { // add part for j := 0; j < batchSize; j++ { // computa lambda - d.Sub(&P[j].Y, &R[j].Y) + d.Sub(&(*P)[j].Y, &(*R)[j].Y) lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[j].X) - rr.X.Sub(&rr.X, 
&P[j].X) - d.Sub(&R[j].X, &rr.X) + rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) - } -} - -// batch inversion -// similar to BatchInvertfptower.E4, ignores edge cases -func batchInvertG2Affine(res, a []fptower.E4) { - - var accumulator fptower.E4 - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index b9baa2cec7..c05f920246 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -174,31 +174,31 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qOpsG1AffineC10, cG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qOpsG1AffineC11, cG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qOpsG1AffineC12, cG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qOpsG1AffineC13, cG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qOpsG1AffineC14, cG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15] + processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qOpsG1AffineC15, cG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] _innerMsmG1(p, 16, points, 
digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -422,31 +422,31 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10] + processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qOpsG2AffineC10, cG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qOpsG2AffineC11, cG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qOpsG2AffineC12, cG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13] + processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qOpsG2AffineC13, cG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qOpsG2AffineC14, cG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15] + processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qOpsG2AffineC15, cG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index e27eb9efeb..f1fb40dea1 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -16,15 +16,18 @@ package bls24317 -const MAX_BATCH_SIZE = 600 +import ( + "github.com/consensys/gnark-crypto/ecc/bls24-317/fp" + "github.com/consensys/gnark-crypto/ecc/bls24-317/internal/fptower" +) -type batchOp struct { - pointID uint32 +type batchOpG1Affine struct { bucketID uint16 + point G1Affine } -func (o batchOp) isNeg() bool { - return o.pointID&1 == 1 +func (o batchOpG1Affine) isNeg() bool { + return o.bucketID&1 == 1 } // processChunkG1BatchAffine process a chunk of 
the scalars during the msm @@ -33,7 +36,8 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( + chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -47,22 +51,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G1Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G1Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -76,76 +71,98 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG1Affine(R[:cptAdd], P[:cptAdd]) + batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG1Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() return } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G1Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . 
+ if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[qID-1] + qID-- } } @@ -154,7 +171,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -164,40 +181,47 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
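Digit decoding in the main loop packs sign and bucket index into one value: the low bit set means subtract (from the signed-digit recoding), and for additions the bucket is (digit >> 1) - 1, since digit 0 means "skip". A hypothetical decoder mirroring those few lines:

package main

import "fmt"

// decode mirrors the digit handling in the main loop (illustrative only).
// digit 0 never reaches this point; the loop skips it.
func decode(digit uint16) (bucketID uint16, isAdd bool) {
	isAdd = digit&1 == 0
	bucketID = digit >> 1
	if isAdd {
		bucketID-- // digit d encodes bucket d-1 for additions
	}
	return
}

func main() {
	for _, d := range []uint16{2, 3, 8, 9} {
		b, add := decode(d)
		fmt.Printf("digit=%d -> bucket %d, isAdd=%v\n", d, b, add)
	}
}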
@@ -222,12 +246,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC4 [1 << (4 - 1)]G1Affine -type bucketG1AffineC5 [1 << (5 - 1)]G1Affine -type bucketG1AffineC6 [1 << (6 - 1)]G1Affine -type bucketG1AffineC7 [1 << (7 - 1)]G1Affine -type bucketG1AffineC8 [1 << (8 - 1)]G1Affine -type bucketG1AffineC9 [1 << (9 - 1)]G1Affine type bucketG1AffineC10 [1 << (10 - 1)]G1Affine type bucketG1AffineC11 [1 << (11 - 1)]G1Affine type bucketG1AffineC12 [1 << (12 - 1)]G1Affine @@ -236,14 +254,9 @@ type bucketG1AffineC14 [1 << (14 - 1)]G1Affine type bucketG1AffineC15 [1 << (15 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +// buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { - bucketG1AffineC4 | - bucketG1AffineC5 | - bucketG1AffineC6 | - bucketG1AffineC7 | - bucketG1AffineC8 | - bucketG1AffineC9 | - bucketG1AffineC10 | + bucketG1AffineC10 | bucketG1AffineC11 | bucketG1AffineC12 | bucketG1AffineC13 | @@ -252,13 +265,95 @@ type ibG1Affine interface { bucketG1AffineC16 } +// array of coordinates fp.Element +type cG1Affine interface { + cG1AffineC10 | + cG1AffineC11 | + cG1AffineC12 | + cG1AffineC13 | + cG1AffineC14 | + cG1AffineC15 | + cG1AffineC16 +} + +// buckets: array of G1Affine points (for the batch addition) +type pG1Affine interface { + pG1AffineC10 | + pG1AffineC11 | + pG1AffineC12 | + pG1AffineC13 | + pG1AffineC14 | + pG1AffineC15 | + pG1AffineC16 +} + +// buckets: array of *G1Affine points (for the batch addition) +type ppG1Affine interface { + ppG1AffineC10 | + ppG1AffineC11 | + ppG1AffineC12 | + ppG1AffineC13 | + ppG1AffineC14 | + ppG1AffineC15 | + ppG1AffineC16 +} + +// buckets: array of G1Affine queue operations (for the batch addition) +type qOpsG1Affine interface { + qOpsG1AffineC10 | + qOpsG1AffineC11 | + qOpsG1AffineC12 | + qOpsG1AffineC13 | + qOpsG1AffineC14 | + qOpsG1AffineC15 | + qOpsG1AffineC16 +} +type cG1AffineC10 [80]fp.Element +type pG1AffineC10 [80]G1Affine +type ppG1AffineC10 [80]*G1Affine +type qOpsG1AffineC10 [80]batchOpG1Affine +type cG1AffineC11 [150]fp.Element +type pG1AffineC11 [150]G1Affine +type ppG1AffineC11 [150]*G1Affine +type qOpsG1AffineC11 [150]batchOpG1Affine +type cG1AffineC12 [200]fp.Element +type pG1AffineC12 [200]G1Affine +type ppG1AffineC12 [200]*G1Affine +type qOpsG1AffineC12 [200]batchOpG1Affine +type cG1AffineC13 [350]fp.Element +type pG1AffineC13 [350]G1Affine +type ppG1AffineC13 [350]*G1Affine +type qOpsG1AffineC13 [350]batchOpG1Affine +type cG1AffineC14 [400]fp.Element +type pG1AffineC14 [400]G1Affine +type ppG1AffineC14 [400]*G1Affine +type qOpsG1AffineC14 [400]batchOpG1Affine +type cG1AffineC15 [500]fp.Element +type pG1AffineC15 [500]G1Affine +type ppG1AffineC15 [500]*G1Affine +type qOpsG1AffineC15 [500]batchOpG1Affine +type cG1AffineC16 [640]fp.Element +type pG1AffineC16 [640]G1Affine +type ppG1AffineC16 [640]*G1Affine +type qOpsG1AffineC16 [640]batchOpG1Affine + +type batchOpG2Affine struct { + bucketID uint16 + point G2Affine +} + +func (o batchOpG2Affine) isNeg() bool { + return o.bucketID&1 == 1 +} + // processChunkG2BatchAffine process a chunk of the scalars during the msm // using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition // we use a batch affine addition. 
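The special cases that add and addFromQueue peel off are exactly the inputs the batched formula cannot handle: whenever the bucket and the incoming point share an x coordinate, the shared denominator x_P - x_BK is zero, which means either a doubling (tangent formula needed) or P + (-P) = infinity. A condensed sketch of that guard with toy integer coordinates (names are illustrative):

package main

import "fmt"

type point struct{ x, y int }

// safeForBatch reports whether bk + p can go through the batched chord
// addition, i.e. whether the denominator p.x - bk.x is nonzero.
// Toy int coordinates stand in for field elements; -1 plays the role
// of p-1 in a real field.
func safeForBatch(bk, p point) (ok bool, why string) {
	if bk.x != p.x {
		return true, "distinct x: chord formula applies"
	}
	if bk.y == p.y {
		return false, "bk == p: doubling, tangent formula needed"
	}
	return false, "bk == -p: sum is the point at infinity"
}

func main() {
	bk := point{5, 1}
	for _, p := range []point{{6, 3}, {5, 1}, {5, -1}} {
		ok, why := safeForBatch(bk, p)
		fmt.Println(p, ok, why)
	}
}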
// // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( + chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -272,22 +367,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G2Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G2Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -301,76 +387,98 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG2Affine(R[:cptAdd], P[:cptAdd]) + batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG2Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() return } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G2Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . 
+ if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue + } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() } + queue[i] = queue[qID-1] + qID-- } } @@ -379,7 +487,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -389,40 +497,47 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
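A note on the queue entries used above: the old batchOp stored a uint32 pointID with the sign folded into its low bit, so a queued op cost a second indexed load and a conditional negation when finally scheduled; batchOpG1Affine and batchOpG2Affine instead store the already-negated point by value, trading queue memory for fewer indirections. A toy size comparison under assumed 6-word coordinates (real widths vary per curve):

package main

import (
	"fmt"
	"unsafe"
)

// Toy stand-ins: a real fp.Element is some fixed number of uint64 words.
type fpElement [6]uint64
type g1Affine struct{ X, Y fpElement }

// old-style queue entry: an index into the points slice plus a sign bit
type opByIndex struct {
	pointID  uint32
	bucketID uint16
}

// new-style queue entry: the (already negated, if needed) point by value
type opByValue struct {
	bucketID uint16
	point    g1Affine
}

func main() {
	// typically prints 8 104 on 64-bit targets
	fmt.Println(unsafe.Sizeof(opByIndex{}), unsafe.Sizeof(opByValue{}))
}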
@@ -447,12 +562,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC4 [1 << (4 - 1)]G2Affine -type bucketG2AffineC5 [1 << (5 - 1)]G2Affine -type bucketG2AffineC6 [1 << (6 - 1)]G2Affine -type bucketG2AffineC7 [1 << (7 - 1)]G2Affine -type bucketG2AffineC8 [1 << (8 - 1)]G2Affine -type bucketG2AffineC9 [1 << (9 - 1)]G2Affine type bucketG2AffineC10 [1 << (10 - 1)]G2Affine type bucketG2AffineC11 [1 << (11 - 1)]G2Affine type bucketG2AffineC12 [1 << (12 - 1)]G2Affine @@ -461,14 +570,9 @@ type bucketG2AffineC14 [1 << (14 - 1)]G2Affine type bucketG2AffineC15 [1 << (15 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +// buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { - bucketG2AffineC4 | - bucketG2AffineC5 | - bucketG2AffineC6 | - bucketG2AffineC7 | - bucketG2AffineC8 | - bucketG2AffineC9 | - bucketG2AffineC10 | + bucketG2AffineC10 | bucketG2AffineC11 | bucketG2AffineC12 | bucketG2AffineC13 | @@ -477,6 +581,78 @@ type ibG2Affine interface { bucketG2AffineC16 } +// array of coordinates fptower.E4 +type cG2Affine interface { + cG2AffineC10 | + cG2AffineC11 | + cG2AffineC12 | + cG2AffineC13 | + cG2AffineC14 | + cG2AffineC15 | + cG2AffineC16 +} + +// buckets: array of G2Affine points (for the batch addition) +type pG2Affine interface { + pG2AffineC10 | + pG2AffineC11 | + pG2AffineC12 | + pG2AffineC13 | + pG2AffineC14 | + pG2AffineC15 | + pG2AffineC16 +} + +// buckets: array of *G2Affine points (for the batch addition) +type ppG2Affine interface { + ppG2AffineC10 | + ppG2AffineC11 | + ppG2AffineC12 | + ppG2AffineC13 | + ppG2AffineC14 | + ppG2AffineC15 | + ppG2AffineC16 +} + +// buckets: array of G2Affine queue operations (for the batch addition) +type qOpsG2Affine interface { + qOpsG2AffineC10 | + qOpsG2AffineC11 | + qOpsG2AffineC12 | + qOpsG2AffineC13 | + qOpsG2AffineC14 | + qOpsG2AffineC15 | + qOpsG2AffineC16 +} +type cG2AffineC10 [80]fptower.E4 +type pG2AffineC10 [80]G2Affine +type ppG2AffineC10 [80]*G2Affine +type qOpsG2AffineC10 [80]batchOpG2Affine +type cG2AffineC11 [150]fptower.E4 +type pG2AffineC11 [150]G2Affine +type ppG2AffineC11 [150]*G2Affine +type qOpsG2AffineC11 [150]batchOpG2Affine +type cG2AffineC12 [200]fptower.E4 +type pG2AffineC12 [200]G2Affine +type ppG2AffineC12 [200]*G2Affine +type qOpsG2AffineC12 [200]batchOpG2Affine +type cG2AffineC13 [350]fptower.E4 +type pG2AffineC13 [350]G2Affine +type ppG2AffineC13 [350]*G2Affine +type qOpsG2AffineC13 [350]batchOpG2Affine +type cG2AffineC14 [400]fptower.E4 +type pG2AffineC14 [400]G2Affine +type ppG2AffineC14 [400]*G2Affine +type qOpsG2AffineC14 [400]batchOpG2Affine +type cG2AffineC15 [500]fptower.E4 +type pG2AffineC15 [500]G2Affine +type ppG2AffineC15 [500]*G2Affine +type qOpsG2AffineC15 [500]batchOpG2Affine +type cG2AffineC16 [640]fptower.E4 +type pG2AffineC16 [640]G2Affine +type ppG2AffineC16 [640]*G2Affine +type qOpsG2AffineC16 [640]batchOpG2Affine + type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool type bitSetC6 [1 << (6 - 1)]bool diff --git a/ecc/bn254/g1.go b/ecc/bn254/g1.go index 75a3e25983..5bad4b316c 100644 --- a/ecc/bn254/g1.go +++ b/ecc/bn254/g1.go @@ -955,20 +955,31 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAddG1Affine(R []*G1Affine, P 
[]G1Affine) { - batchSize := len(R) - if batchSize == 0 { - return - } - var lambda, lambdain [MAX_BATCH_SIZE]fp.Element +func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, batchSize int) { + var lambda, lambdain TC // add part for j := 0; j < batchSize; j++ { - lambdain[j].Sub(&P[j].X, &R[j].X) + lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X) } - // invert denominator - batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) + // invert denominator using montgomery batch invert technique + { + var accumulator fp.Element + accumulator.SetOne() + + for i := 0; i < batchSize; i++ { + lambda[i] = accumulator + accumulator.Mul(&accumulator, &lambdain[i]) + } + + accumulator.Inverse(&accumulator) + + for i := batchSize - 1; i >= 0; i-- { + lambda[i].Mul(&lambda[i], &accumulator) + accumulator.Mul(&accumulator, &lambdain[i]) + } + } var d fp.Element var rr G1Affine @@ -976,36 +987,16 @@ func batchAddG1Affine(R []*G1Affine, P []G1Affine) { // add part for j := 0; j < batchSize; j++ { // computa lambda - d.Sub(&P[j].Y, &R[j].Y) + d.Sub(&(*P)[j].Y, &(*R)[j].Y) lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[j].X) - rr.X.Sub(&rr.X, &P[j].X) - d.Sub(&R[j].X, &rr.X) + rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) - } -} - -// batch inversion -// similar to BatchInvertfp.Element, ignores edge cases -func batchInvertG1Affine(res, a []fp.Element) { - - var accumulator fp.Element - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } diff --git a/ecc/bn254/g2.go b/ecc/bn254/g2.go index 6437e4542e..09011b0c53 100644 --- a/ecc/bn254/g2.go +++ b/ecc/bn254/g2.go @@ -984,20 +984,31 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAddG2Affine(R []*G2Affine, P []G2Affine) { - batchSize := len(R) - if batchSize == 0 { - return - } - var lambda, lambdain [MAX_BATCH_SIZE]fptower.E2 +func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, batchSize int) { + var lambda, lambdain TC // add part for j := 0; j < batchSize; j++ { - lambdain[j].Sub(&P[j].X, &R[j].X) + lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X) } - // invert denominator - batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) + // invert denominator using montgomery batch invert technique + { + var accumulator fptower.E2 + accumulator.SetOne() + + for i := 0; i < batchSize; i++ { + lambda[i] = accumulator + accumulator.Mul(&accumulator, &lambdain[i]) + } + + accumulator.Inverse(&accumulator) + + for i := batchSize - 1; i >= 0; i-- { + lambda[i].Mul(&lambda[i], &accumulator) + accumulator.Mul(&accumulator, &lambdain[i]) + } + } var d fptower.E2 var rr G2Affine @@ -1005,36 +1016,16 @@ func batchAddG2Affine(R []*G2Affine, P []G2Affine) { // add part for j := 0; j < batchSize; j++ { // computa lambda - d.Sub(&P[j].Y, &R[j].Y) + d.Sub(&(*P)[j].Y, &(*R)[j].Y) lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[j].X) - rr.X.Sub(&rr.X, &P[j].X) - d.Sub(&R[j].X, &rr.X) + 
rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) - } -} - -// batch inversion -// similar to BatchInvertfptower.E2, ignores edge cases -func batchInvertG2Affine(res, a []fptower.E2) { - - var accumulator fptower.E2 - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index 75e8a96061..6989e58d4b 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -174,31 +174,31 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qOpsG1AffineC10, cG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qOpsG1AffineC11, cG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qOpsG1AffineC12, cG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qOpsG1AffineC13, cG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qOpsG1AffineC14, cG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15] + processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qOpsG1AffineC15, cG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) 
default: panic("not implemented") @@ -422,31 +422,31 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10] + processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qOpsG2AffineC10, cG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qOpsG2AffineC11, cG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qOpsG2AffineC12, cG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13] + processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qOpsG2AffineC13, cG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qOpsG2AffineC14, cG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15] + processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qOpsG2AffineC15, cG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index 1f6ba85280..8eeded8aa1 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -16,15 +16,18 @@ package bn254 -const MAX_BATCH_SIZE = 600 +import ( + "github.com/consensys/gnark-crypto/ecc/bn254/fp" + "github.com/consensys/gnark-crypto/ecc/bn254/internal/fptower" +) -type batchOp struct { - pointID uint32 +type batchOpG1Affine struct { bucketID uint16 + point G1Affine } -func (o batchOp) isNeg() bool { - return o.pointID&1 == 1 +func (o batchOpG1Affine) isNeg() bool { + return o.bucketID&1 == 1 } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -33,7 +36,8 @@ func (o batchOp) isNeg() bool { // // 
this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( + chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -47,22 +51,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G1Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G1Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -76,76 +71,98 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG1Affine(R[:cptAdd], P[:cptAdd]) + batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG1Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() return } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G1Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . 
+ if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[qID-1] + qID-- } } @@ -154,7 +171,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -164,40 +181,47 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
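[editor's note] The block inlined into batchAddG1Affine/batchAddG2Affine above is the classic Montgomery batch-inversion (prefix-product) trick: n field inversions become one inversion plus about 3n multiplications. A self-contained sketch over math/big with a small prime, for illustration only — the real code works in place on fp.Element (resp. fptower.E2) arrays and, like the original, ignores zero inputs.

package main

import (
	"fmt"
	"math/big"
)

// batchInvert computes res[i] = a[i]^-1 mod p with a single modular
// inversion, using the same two-pass prefix-product technique as the
// hunk above. Edge cases (a[i] == 0) are ignored, as in the original.
func batchInvert(a []*big.Int, p *big.Int) []*big.Int {
	res := make([]*big.Int, len(a))
	acc := big.NewInt(1)

	// forward pass: res[i] = a[0]*...*a[i-1]
	for i := range a {
		res[i] = new(big.Int).Set(acc)
		acc.Mul(acc, a[i])
		acc.Mod(acc, p)
	}

	// one inversion of the full product
	acc.ModInverse(acc, p)

	// backward pass: res[i] = (a[0]*...*a[i-1]) * (a[0]*...*a[n-1])^-1
	// with the tail (a[i+1]*...*a[n-1]) folded back in, i.e. a[i]^-1
	for i := len(a) - 1; i >= 0; i-- {
		res[i].Mul(res[i], acc)
		res[i].Mod(res[i], p)
		acc.Mul(acc, a[i])
		acc.Mod(acc, p)
	}
	return res
}

func main() {
	p := big.NewInt(101)
	a := []*big.Int{big.NewInt(3), big.NewInt(7), big.NewInt(50)}
	for i, inv := range batchInvert(a, p) {
		check := new(big.Int).Mul(a[i], inv)
		fmt.Println(a[i], "*", inv, "=", check.Mod(check, p)) // always 1
	}
}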
@@ -222,12 +246,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC4 [1 << (4 - 1)]G1Affine -type bucketG1AffineC5 [1 << (5 - 1)]G1Affine -type bucketG1AffineC6 [1 << (6 - 1)]G1Affine -type bucketG1AffineC7 [1 << (7 - 1)]G1Affine -type bucketG1AffineC8 [1 << (8 - 1)]G1Affine -type bucketG1AffineC9 [1 << (9 - 1)]G1Affine type bucketG1AffineC10 [1 << (10 - 1)]G1Affine type bucketG1AffineC11 [1 << (11 - 1)]G1Affine type bucketG1AffineC12 [1 << (12 - 1)]G1Affine @@ -236,14 +254,9 @@ type bucketG1AffineC14 [1 << (14 - 1)]G1Affine type bucketG1AffineC15 [1 << (15 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +// buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { - bucketG1AffineC4 | - bucketG1AffineC5 | - bucketG1AffineC6 | - bucketG1AffineC7 | - bucketG1AffineC8 | - bucketG1AffineC9 | - bucketG1AffineC10 | + bucketG1AffineC10 | bucketG1AffineC11 | bucketG1AffineC12 | bucketG1AffineC13 | @@ -252,13 +265,95 @@ type ibG1Affine interface { bucketG1AffineC16 } +// array of coordinates fp.Element +type cG1Affine interface { + cG1AffineC10 | + cG1AffineC11 | + cG1AffineC12 | + cG1AffineC13 | + cG1AffineC14 | + cG1AffineC15 | + cG1AffineC16 +} + +// buckets: array of G1Affine points (for the batch addition) +type pG1Affine interface { + pG1AffineC10 | + pG1AffineC11 | + pG1AffineC12 | + pG1AffineC13 | + pG1AffineC14 | + pG1AffineC15 | + pG1AffineC16 +} + +// buckets: array of *G1Affine points (for the batch addition) +type ppG1Affine interface { + ppG1AffineC10 | + ppG1AffineC11 | + ppG1AffineC12 | + ppG1AffineC13 | + ppG1AffineC14 | + ppG1AffineC15 | + ppG1AffineC16 +} + +// buckets: array of G1Affine queue operations (for the batch addition) +type qOpsG1Affine interface { + qOpsG1AffineC10 | + qOpsG1AffineC11 | + qOpsG1AffineC12 | + qOpsG1AffineC13 | + qOpsG1AffineC14 | + qOpsG1AffineC15 | + qOpsG1AffineC16 +} +type cG1AffineC10 [80]fp.Element +type pG1AffineC10 [80]G1Affine +type ppG1AffineC10 [80]*G1Affine +type qOpsG1AffineC10 [80]batchOpG1Affine +type cG1AffineC11 [150]fp.Element +type pG1AffineC11 [150]G1Affine +type ppG1AffineC11 [150]*G1Affine +type qOpsG1AffineC11 [150]batchOpG1Affine +type cG1AffineC12 [200]fp.Element +type pG1AffineC12 [200]G1Affine +type ppG1AffineC12 [200]*G1Affine +type qOpsG1AffineC12 [200]batchOpG1Affine +type cG1AffineC13 [350]fp.Element +type pG1AffineC13 [350]G1Affine +type ppG1AffineC13 [350]*G1Affine +type qOpsG1AffineC13 [350]batchOpG1Affine +type cG1AffineC14 [400]fp.Element +type pG1AffineC14 [400]G1Affine +type ppG1AffineC14 [400]*G1Affine +type qOpsG1AffineC14 [400]batchOpG1Affine +type cG1AffineC15 [500]fp.Element +type pG1AffineC15 [500]G1Affine +type ppG1AffineC15 [500]*G1Affine +type qOpsG1AffineC15 [500]batchOpG1Affine +type cG1AffineC16 [640]fp.Element +type pG1AffineC16 [640]G1Affine +type ppG1AffineC16 [640]*G1Affine +type qOpsG1AffineC16 [640]batchOpG1Affine + +type batchOpG2Affine struct { + bucketID uint16 + point G2Affine +} + +func (o batchOpG2Affine) isNeg() bool { + return o.bucketID&1 == 1 +} + // processChunkG2BatchAffine process a chunk of the scalars during the msm // using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition // we use a batch affine addition. 
// // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( + chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -272,22 +367,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G2Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G2Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -301,76 +387,98 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG2Affine(R[:cptAdd], P[:cptAdd]) + batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG2Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() return } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G2Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . 
+ if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue + } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() } + queue[i] = queue[qID-1] + qID-- } } @@ -379,7 +487,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -389,40 +497,47 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
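[editor's note] Each scalar digit in the loops above is consumed the same way: a zero digit is skipped, the low bit says whether to add or subtract, and the remaining bits select the bucket. Subtraction is realized by negating the point before it enters the batch, which is what lets 2^(c-1) buckets cover a signed window of c bits. A small sketch of just that decoding, assuming the encoding the loop implies; the helper name is hypothetical.

package main

import "fmt"

// decode mirrors the digit handling above: zero is a no-op, the low
// bit selects add vs. subtract, the remaining bits locate the bucket
// (bucket b accumulates multiples of b+1 under this encoding).
func decode(digit uint16) (bucketID uint16, isAdd bool, skip bool) {
	if digit == 0 {
		return 0, false, true
	}
	isAdd = digit&1 == 0
	bucketID = digit >> 1
	if isAdd {
		bucketID-- // even digit 2m means "add m", stored in bucket m-1
	}
	return bucketID, isAdd, false
}

func main() {
	for _, d := range []uint16{0, 2, 3, 4, 5} {
		b, add, skip := decode(d)
		fmt.Printf("digit=%d skip=%v bucket=%d add=%v\n", d, skip, b, add)
	}
}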
@@ -447,12 +562,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC4 [1 << (4 - 1)]G2Affine -type bucketG2AffineC5 [1 << (5 - 1)]G2Affine -type bucketG2AffineC6 [1 << (6 - 1)]G2Affine -type bucketG2AffineC7 [1 << (7 - 1)]G2Affine -type bucketG2AffineC8 [1 << (8 - 1)]G2Affine -type bucketG2AffineC9 [1 << (9 - 1)]G2Affine type bucketG2AffineC10 [1 << (10 - 1)]G2Affine type bucketG2AffineC11 [1 << (11 - 1)]G2Affine type bucketG2AffineC12 [1 << (12 - 1)]G2Affine @@ -461,14 +570,9 @@ type bucketG2AffineC14 [1 << (14 - 1)]G2Affine type bucketG2AffineC15 [1 << (15 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +// buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { - bucketG2AffineC4 | - bucketG2AffineC5 | - bucketG2AffineC6 | - bucketG2AffineC7 | - bucketG2AffineC8 | - bucketG2AffineC9 | - bucketG2AffineC10 | + bucketG2AffineC10 | bucketG2AffineC11 | bucketG2AffineC12 | bucketG2AffineC13 | @@ -477,6 +581,78 @@ type ibG2Affine interface { bucketG2AffineC16 } +// array of coordinates fptower.E2 +type cG2Affine interface { + cG2AffineC10 | + cG2AffineC11 | + cG2AffineC12 | + cG2AffineC13 | + cG2AffineC14 | + cG2AffineC15 | + cG2AffineC16 +} + +// buckets: array of G2Affine points (for the batch addition) +type pG2Affine interface { + pG2AffineC10 | + pG2AffineC11 | + pG2AffineC12 | + pG2AffineC13 | + pG2AffineC14 | + pG2AffineC15 | + pG2AffineC16 +} + +// buckets: array of *G2Affine points (for the batch addition) +type ppG2Affine interface { + ppG2AffineC10 | + ppG2AffineC11 | + ppG2AffineC12 | + ppG2AffineC13 | + ppG2AffineC14 | + ppG2AffineC15 | + ppG2AffineC16 +} + +// buckets: array of G2Affine queue operations (for the batch addition) +type qOpsG2Affine interface { + qOpsG2AffineC10 | + qOpsG2AffineC11 | + qOpsG2AffineC12 | + qOpsG2AffineC13 | + qOpsG2AffineC14 | + qOpsG2AffineC15 | + qOpsG2AffineC16 +} +type cG2AffineC10 [80]fptower.E2 +type pG2AffineC10 [80]G2Affine +type ppG2AffineC10 [80]*G2Affine +type qOpsG2AffineC10 [80]batchOpG2Affine +type cG2AffineC11 [150]fptower.E2 +type pG2AffineC11 [150]G2Affine +type ppG2AffineC11 [150]*G2Affine +type qOpsG2AffineC11 [150]batchOpG2Affine +type cG2AffineC12 [200]fptower.E2 +type pG2AffineC12 [200]G2Affine +type ppG2AffineC12 [200]*G2Affine +type qOpsG2AffineC12 [200]batchOpG2Affine +type cG2AffineC13 [350]fptower.E2 +type pG2AffineC13 [350]G2Affine +type ppG2AffineC13 [350]*G2Affine +type qOpsG2AffineC13 [350]batchOpG2Affine +type cG2AffineC14 [400]fptower.E2 +type pG2AffineC14 [400]G2Affine +type ppG2AffineC14 [400]*G2Affine +type qOpsG2AffineC14 [400]batchOpG2Affine +type cG2AffineC15 [500]fptower.E2 +type pG2AffineC15 [500]G2Affine +type ppG2AffineC15 [500]*G2Affine +type qOpsG2AffineC15 [500]batchOpG2Affine +type cG2AffineC16 [640]fptower.E2 +type pG2AffineC16 [640]G2Affine +type ppG2AffineC16 [640]*G2Affine +type qOpsG2AffineC16 [640]batchOpG2Affine + type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool type bitSetC6 [1 << (6 - 1)]bool diff --git a/ecc/bw6-633/g1.go b/ecc/bw6-633/g1.go index 707fbefb6f..dc2289ac76 100644 --- a/ecc/bw6-633/g1.go +++ b/ecc/bw6-633/g1.go @@ -1087,20 +1087,31 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAddG1Affine(R 
[]*G1Affine, P []G1Affine) { - batchSize := len(R) - if batchSize == 0 { - return - } - var lambda, lambdain [MAX_BATCH_SIZE]fp.Element +func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, batchSize int) { + var lambda, lambdain TC // add part for j := 0; j < batchSize; j++ { - lambdain[j].Sub(&P[j].X, &R[j].X) + lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X) } - // invert denominator - batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) + // invert denominator using montgomery batch invert technique + { + var accumulator fp.Element + accumulator.SetOne() + + for i := 0; i < batchSize; i++ { + lambda[i] = accumulator + accumulator.Mul(&accumulator, &lambdain[i]) + } + + accumulator.Inverse(&accumulator) + + for i := batchSize - 1; i >= 0; i-- { + lambda[i].Mul(&lambda[i], &accumulator) + accumulator.Mul(&accumulator, &lambdain[i]) + } + } var d fp.Element var rr G1Affine @@ -1108,36 +1119,16 @@ func batchAddG1Affine(R []*G1Affine, P []G1Affine) { // add part for j := 0; j < batchSize; j++ { // computa lambda - d.Sub(&P[j].Y, &R[j].Y) + d.Sub(&(*P)[j].Y, &(*R)[j].Y) lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[j].X) - rr.X.Sub(&rr.X, &P[j].X) - d.Sub(&R[j].X, &rr.X) + rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) - } -} - -// batch inversion -// similar to BatchInvertfp.Element, ignores edge cases -func batchInvertG1Affine(res, a []fp.Element) { - - var accumulator fp.Element - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } diff --git a/ecc/bw6-633/g2.go b/ecc/bw6-633/g2.go index 69e4b4263c..3d27026424 100644 --- a/ecc/bw6-633/g2.go +++ b/ecc/bw6-633/g2.go @@ -950,20 +950,31 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAddG2Affine(R []*G2Affine, P []G2Affine) { - batchSize := len(R) - if batchSize == 0 { - return - } - var lambda, lambdain [MAX_BATCH_SIZE]fp.Element +func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, batchSize int) { + var lambda, lambdain TC // add part for j := 0; j < batchSize; j++ { - lambdain[j].Sub(&P[j].X, &R[j].X) + lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X) } - // invert denominator - batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) + // invert denominator using montgomery batch invert technique + { + var accumulator fp.Element + accumulator.SetOne() + + for i := 0; i < batchSize; i++ { + lambda[i] = accumulator + accumulator.Mul(&accumulator, &lambdain[i]) + } + + accumulator.Inverse(&accumulator) + + for i := batchSize - 1; i >= 0; i-- { + lambda[i].Mul(&lambda[i], &accumulator) + accumulator.Mul(&accumulator, &lambdain[i]) + } + } var d fp.Element var rr G2Affine @@ -971,36 +982,16 @@ func batchAddG2Affine(R []*G2Affine, P []G2Affine) { // add part for j := 0; j < batchSize; j++ { // computa lambda - d.Sub(&P[j].Y, &R[j].Y) + d.Sub(&(*P)[j].Y, &(*R)[j].Y) lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[j].X) - rr.X.Sub(&rr.X, &P[j].X) - 
d.Sub(&R[j].X, &rr.X) + rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) - } -} - -// batch inversion -// similar to BatchInvertfp.Element, ignores edge cases -func batchInvertG2Affine(res, a []fp.Element) { - - var accumulator fp.Element - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index fb6367cb6b..d329dacf85 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -161,7 +161,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -372,7 +372,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index 38c1973d0e..fd164fefd2 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -16,15 +16,18 @@ package bw6633 -const MAX_BATCH_SIZE = 600 +import ( + "github.com/consensys/gnark-crypto/ecc/bw6-633/fp" + "github.com/consensys/gnark-crypto/ecc/bw6-633/internal/fptower" +) -type batchOp struct { - pointID uint32 +type batchOpG1Affine struct { bucketID uint16 + point G1Affine } -func (o batchOp) isNeg() bool { - return o.pointID&1 == 1 +func (o batchOpG1Affine) isNeg() bool { + return o.bucketID&1 == 1 } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -33,7 +36,8 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( + chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -47,22 +51,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. 
- batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G1Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G1Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -76,76 +71,98 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG1Affine(R[:cptAdd], P[:cptAdd]) + batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG1Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) return } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G1Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . 
+ if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue + } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() } + queue[i] = queue[qID-1] + qID-- } } @@ -154,7 +171,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -164,40 +181,47 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
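[editor's note] Written out, the batched loop evaluates the textbook affine chord formulas lambda = (y2-y1)/(x2-x1), x3 = lambda^2 - x1 - x2, y3 = lambda*(x1-x3) - y1, with the division replaced by a multiplication against the batch-inverted denominator. A worked sketch over math/big on the toy curve y^2 = x^3 + 7 mod 17, illustrative only; the real code operates on fp.Element coordinates.

package main

import (
	"fmt"
	"math/big"
)

var p = big.NewInt(17) // toy prime; the real code works over the curve's fp

// affineAdd applies the same formulas as the batch-add loop above. In
// the batched version, the single ModInverse below is exactly what the
// shared batch inversion amortizes away.
func affineAdd(x1, y1, x2, y2 *big.Int) (*big.Int, *big.Int) {
	lambda := new(big.Int).Sub(y2, y1)
	den := new(big.Int).Sub(x2, x1)
	den.ModInverse(den, p)
	lambda.Mul(lambda, den)
	lambda.Mod(lambda, p)

	x3 := new(big.Int).Mul(lambda, lambda)
	x3.Sub(x3, x1)
	x3.Sub(x3, x2)
	x3.Mod(x3, p)

	y3 := new(big.Int).Sub(x1, x3)
	y3.Mul(y3, lambda)
	y3.Sub(y3, y1)
	y3.Mod(y3, p)
	return x3, y3
}

func main() {
	// (1,5) and (5,8) both lie on y^2 = x^3 + 7 over F_17
	x3, y3 := affineAdd(big.NewInt(1), big.NewInt(5), big.NewInt(5), big.NewInt(8))
	fmt.Println(x3, y3) // 2 7, also on the curve
}

After the shared inversion, each iteration of the loop costs one multiplication to finish lambda, one squaring, and one more multiplication — matching the Mul/Square calls visible per point above.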
@@ -222,16 +246,44 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC4 [1 << (4 - 1)]G1Affine -type bucketG1AffineC5 [1 << (5 - 1)]G1Affine -type bucketG1AffineC8 [1 << (8 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +// buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { - bucketG1AffineC4 | - bucketG1AffineC5 | - bucketG1AffineC8 | - bucketG1AffineC16 + bucketG1AffineC16 +} + +// array of coordinates fp.Element +type cG1Affine interface { + cG1AffineC16 +} + +// buckets: array of G1Affine points (for the batch addition) +type pG1Affine interface { + pG1AffineC16 +} + +// buckets: array of *G1Affine points (for the batch addition) +type ppG1Affine interface { + ppG1AffineC16 +} + +// buckets: array of G1Affine queue operations (for the batch addition) +type qOpsG1Affine interface { + qOpsG1AffineC16 +} +type cG1AffineC16 [640]fp.Element +type pG1AffineC16 [640]G1Affine +type ppG1AffineC16 [640]*G1Affine +type qOpsG1AffineC16 [640]batchOpG1Affine + +type batchOpG2Affine struct { + bucketID uint16 + point G2Affine +} + +func (o batchOpG2Affine) isNeg() bool { + return o.bucketID&1 == 1 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -240,7 +292,8 @@ type ibG1Affine interface { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( + chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -254,22 +307,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G2Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G2Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -283,76 +327,98 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG2Affine(R[:cptAdd], P[:cptAdd]) + batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG2Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. 
+ // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() return } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G2Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . + if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue + } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() } + queue[i] = queue[qID-1] + qID-- } } @@ -361,7 +427,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -371,40 +437,47 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
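[editor's note] The new pG*/ppG*/qOps*/cG* type families replace the former MAX_BATCH_SIZE arrays: each window size c gets its own fixed-length array types, joined in a union interface, so every instantiation of the generic chunk processor knows its buffer lengths at compile time and the buffers can stay on the stack instead of escaping to the heap. A toy mirror of the pattern (names illustrative):

package main

import "fmt"

// Fixed-size array types joined in a type-set interface, as the patch
// does per window size c.
type bufC4 [1 << (4 - 1)]int
type bufC5 [1 << (5 - 1)]int

type ibuf interface {
	bufC4 | bufC5
}

// fill mirrors how processChunkG1BatchAffine uses its TP/TPP/TQ/TC
// parameters: declare the array locally, take len of the value, and
// index through a pointer, exactly like batchSize := len(P) and
// (*P)[j] in the hunks above.
func fill[B ibuf](buf *B) int {
	n := len(*buf)
	for i := 0; i < n; i++ {
		(*buf)[i] = i
	}
	return n
}

func main() {
	var small bufC4 // stack-allocated; length known at compile time
	var large bufC5
	fmt.Println(fill(&small), fill(&large)) // 8 16
}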
@@ -429,17 +502,36 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC4 [1 << (4 - 1)]G2Affine -type bucketG2AffineC5 [1 << (5 - 1)]G2Affine -type bucketG2AffineC8 [1 << (8 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +// buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { - bucketG2AffineC4 | - bucketG2AffineC5 | - bucketG2AffineC8 | - bucketG2AffineC16 + bucketG2AffineC16 +} + +// array of coordinates fp.Element +type cG2Affine interface { + cG2AffineC16 +} + +// buckets: array of G2Affine points (for the batch addition) +type pG2Affine interface { + pG2AffineC16 +} + +// buckets: array of *G2Affine points (for the batch addition) +type ppG2Affine interface { + ppG2AffineC16 +} + +// buckets: array of G2Affine queue operations (for the batch addition) +type qOpsG2Affine interface { + qOpsG2AffineC16 } +type cG2AffineC16 [640]fp.Element +type pG2AffineC16 [640]G2Affine +type ppG2AffineC16 [640]*G2Affine +type qOpsG2AffineC16 [640]batchOpG2Affine type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool diff --git a/ecc/bw6-756/g1.go b/ecc/bw6-756/g1.go index 395f0a7f84..5cbf001665 100644 --- a/ecc/bw6-756/g1.go +++ b/ecc/bw6-756/g1.go @@ -1087,20 +1087,31 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAddG1Affine(R []*G1Affine, P []G1Affine) { - batchSize := len(R) - if batchSize == 0 { - return - } - var lambda, lambdain [MAX_BATCH_SIZE]fp.Element +func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, batchSize int) { + var lambda, lambdain TC // add part for j := 0; j < batchSize; j++ { - lambdain[j].Sub(&P[j].X, &R[j].X) + lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X) } - // invert denominator - batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) + // invert denominator using montgomery batch invert technique + { + var accumulator fp.Element + accumulator.SetOne() + + for i := 0; i < batchSize; i++ { + lambda[i] = accumulator + accumulator.Mul(&accumulator, &lambdain[i]) + } + + accumulator.Inverse(&accumulator) + + for i := batchSize - 1; i >= 0; i-- { + lambda[i].Mul(&lambda[i], &accumulator) + accumulator.Mul(&accumulator, &lambdain[i]) + } + } var d fp.Element var rr G1Affine @@ -1108,36 +1119,16 @@ func batchAddG1Affine(R []*G1Affine, P []G1Affine) { // add part for j := 0; j < batchSize; j++ { // computa lambda - d.Sub(&P[j].Y, &R[j].Y) + d.Sub(&(*P)[j].Y, &(*R)[j].Y) lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[j].X) - rr.X.Sub(&rr.X, &P[j].X) - d.Sub(&R[j].X, &rr.X) + rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) - } -} - -// batch inversion -// similar to BatchInvertfp.Element, ignores edge cases -func batchInvertG1Affine(res, a []fp.Element) { - - var accumulator fp.Element - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } diff --git 
a/ecc/bw6-756/g2.go b/ecc/bw6-756/g2.go index 63a4631e6d..e8b048fb9b 100644 --- a/ecc/bw6-756/g2.go +++ b/ecc/bw6-756/g2.go @@ -944,20 +944,31 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAddG2Affine(R []*G2Affine, P []G2Affine) { - batchSize := len(R) - if batchSize == 0 { - return - } - var lambda, lambdain [MAX_BATCH_SIZE]fp.Element +func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, batchSize int) { + var lambda, lambdain TC // add part for j := 0; j < batchSize; j++ { - lambdain[j].Sub(&P[j].X, &R[j].X) + lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X) } - // invert denominator - batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) + // invert denominator using montgomery batch invert technique + { + var accumulator fp.Element + accumulator.SetOne() + + for i := 0; i < batchSize; i++ { + lambda[i] = accumulator + accumulator.Mul(&accumulator, &lambdain[i]) + } + + accumulator.Inverse(&accumulator) + + for i := batchSize - 1; i >= 0; i-- { + lambda[i].Mul(&lambda[i], &accumulator) + accumulator.Mul(&accumulator, &lambdain[i]) + } + } var d fp.Element var rr G2Affine @@ -965,36 +976,16 @@ func batchAddG2Affine(R []*G2Affine, P []G2Affine) { // add part for j := 0; j < batchSize; j++ { // computa lambda - d.Sub(&P[j].Y, &R[j].Y) + d.Sub(&(*P)[j].Y, &(*R)[j].Y) lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[j].X) - rr.X.Sub(&rr.X, &P[j].X) - d.Sub(&R[j].X, &rr.X) + rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) - } -} - -// batch inversion -// similar to BatchInvertfp.Element, ignores edge cases -func batchInvertG2Affine(res, a []fp.Element) { - - var accumulator fp.Element - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index 0ba6a6ed57..719bca28bf 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -162,7 +162,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -374,7 +374,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, 
processChunk) default: panic("not implemented") diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index c59b38e882..a9f8f12db5 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -16,15 +16,18 @@ package bw6756 -const MAX_BATCH_SIZE = 600 +import ( + "github.com/consensys/gnark-crypto/ecc/bw6-756/fp" + "github.com/consensys/gnark-crypto/ecc/bw6-756/internal/fptower" +) -type batchOp struct { - pointID uint32 +type batchOpG1Affine struct { bucketID uint16 + point G1Affine } -func (o batchOp) isNeg() bool { - return o.pointID&1 == 1 +func (o batchOpG1Affine) isNeg() bool { + return o.bucketID&1 == 1 } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -33,7 +36,8 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( + chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -47,22 +51,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G1Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G1Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -76,76 +71,98 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG1Affine(R[:cptAdd], P[:cptAdd]) + batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG1Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) return } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? 
+ BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G1Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . + if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue + } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() } + queue[i] = queue[qID-1] + qID-- } } @@ -154,7 +171,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -164,40 +181,47 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
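[editor's note] Because buckets live in plain affine coordinates, the add closure above must special-case everything that complete Jacobian formulas would absorb: an empty bucket, P + P, and P - P, the latter two detected via equal x-coordinates. A schematic mirror of that case analysis with the field arithmetic stubbed out, illustrative only:

package main

import "fmt"

type point struct {
	x, y int
	inf  bool
}

// bucketUpdate enumerates the same cases as the add closure: only the
// last case is deferred to the batched addition; the others mutate the
// bucket immediately.
func bucketUpdate(bk, p point, isAdd bool) string {
	switch {
	case bk.inf: // empty bucket
		if isAdd {
			return "set BK = P"
		}
		return "set BK = -P"
	case bk.x == p.x && bk.y == p.y: // same point
		if isAdd {
			return "double BK (P + P)"
		}
		return "set BK to infinity (P - P)"
	case bk.x == p.x: // same x, opposite y: BK == -P
		if isAdd {
			return "set BK to infinity (-P + P)"
		}
		return "double BK (-P - P)"
	default:
		return "queue (BK, +/-P) for the batched addition"
	}
}

func main() {
	bk := point{x: 1, y: 5}
	fmt.Println(bucketUpdate(bk, point{x: 1, y: 5}, true))  // doubling
	fmt.Println(bucketUpdate(bk, point{x: 1, y: 5}, false)) // cancellation
	fmt.Println(bucketUpdate(bk, point{x: 2, y: 7}, true))  // generic: batched
}

Only the generic case benefits from the batch; doubling, flagged TODO above, currently falls back to the full Add rather than a dedicated affine double.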
@@ -222,16 +246,44 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC4 [1 << (4 - 1)]G1Affine -type bucketG1AffineC5 [1 << (5 - 1)]G1Affine -type bucketG1AffineC8 [1 << (8 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +// buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { - bucketG1AffineC4 | - bucketG1AffineC5 | - bucketG1AffineC8 | - bucketG1AffineC16 + bucketG1AffineC16 +} + +// array of coordinates fp.Element +type cG1Affine interface { + cG1AffineC16 +} + +// buckets: array of G1Affine points (for the batch addition) +type pG1Affine interface { + pG1AffineC16 +} + +// buckets: array of *G1Affine points (for the batch addition) +type ppG1Affine interface { + ppG1AffineC16 +} + +// buckets: array of G1Affine queue operations (for the batch addition) +type qOpsG1Affine interface { + qOpsG1AffineC16 +} +type cG1AffineC16 [640]fp.Element +type pG1AffineC16 [640]G1Affine +type ppG1AffineC16 [640]*G1Affine +type qOpsG1AffineC16 [640]batchOpG1Affine + +type batchOpG2Affine struct { + bucketID uint16 + point G2Affine +} + +func (o batchOpG2Affine) isNeg() bool { + return o.bucketID&1 == 1 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -240,7 +292,8 @@ type ibG1Affine interface { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( + chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -254,22 +307,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G2Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G2Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -283,76 +327,98 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG2Affine(R[:cptAdd], P[:cptAdd]) + batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG2Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. 
+ // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() return } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G2Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . + if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue + } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() } + queue[i] = queue[qID-1] + qID-- } } @@ -361,7 +427,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -371,40 +437,47 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
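
The "montgomery batch invert technique" that batchAddG1Affine and batchAddG2Affine lean on is worth seeing in isolation: n modular inverses for the price of a single inversion plus roughly 3n multiplications. Here is a minimal, self-contained illustration using math/big instead of the package's field types (nothing below is gnark-crypto API); the forward/backward accumulator loops mirror the ones inlined in the hunks nearby.

package main

import (
	"fmt"
	"math/big"
)

// batchInvert inverts every a[i] modulo p with a single ModInverse call.
// Like the library routines it illustrates, it ignores edge cases: each
// a[i] is assumed nonzero mod p.
func batchInvert(a []*big.Int, p *big.Int) []*big.Int {
	res := make([]*big.Int, len(a))
	acc := big.NewInt(1)
	for i := range a {
		res[i] = new(big.Int).Set(acc) // res[i] = a[0]*...*a[i-1]
		acc.Mul(acc, a[i]).Mod(acc, p)
	}
	acc.ModInverse(acc, p) // the one and only inversion
	for i := len(a) - 1; i >= 0; i-- {
		res[i].Mul(res[i], acc).Mod(res[i], p) // res[i] = 1/a[i]
		acc.Mul(acc, a[i]).Mod(acc, p)         // acc = 1/(a[0]*...*a[i-1])
	}
	return res
}

func main() {
	p := big.NewInt(10007) // toy prime standing in for the fp modulus
	a := []*big.Int{big.NewInt(3), big.NewInt(42), big.NewInt(9999)}
	for i, inv := range batchInvert(a, p) {
		check := new(big.Int).Mul(a[i], inv)
		fmt.Println(check.Mod(check, p)) // 1, three times
	}
}

This is why a larger batch is cheaper per point: the cost of the single Inverse is spread across every addition in the batch.
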
@@ -429,17 +502,36 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC4 [1 << (4 - 1)]G2Affine -type bucketG2AffineC5 [1 << (5 - 1)]G2Affine -type bucketG2AffineC8 [1 << (8 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +// buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { - bucketG2AffineC4 | - bucketG2AffineC5 | - bucketG2AffineC8 | - bucketG2AffineC16 + bucketG2AffineC16 +} + +// array of coordinates fp.Element +type cG2Affine interface { + cG2AffineC16 +} + +// buckets: array of G2Affine points (for the batch addition) +type pG2Affine interface { + pG2AffineC16 +} + +// buckets: array of *G2Affine points (for the batch addition) +type ppG2Affine interface { + ppG2AffineC16 +} + +// buckets: array of G2Affine queue operations (for the batch addition) +type qOpsG2Affine interface { + qOpsG2AffineC16 } +type cG2AffineC16 [640]fp.Element +type pG2AffineC16 [640]G2Affine +type ppG2AffineC16 [640]*G2Affine +type qOpsG2AffineC16 [640]batchOpG2Affine type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool diff --git a/ecc/bw6-761/g1.go b/ecc/bw6-761/g1.go index 880371b042..d6de060519 100644 --- a/ecc/bw6-761/g1.go +++ b/ecc/bw6-761/g1.go @@ -1098,20 +1098,31 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAddG1Affine(R []*G1Affine, P []G1Affine) { - batchSize := len(R) - if batchSize == 0 { - return - } - var lambda, lambdain [MAX_BATCH_SIZE]fp.Element +func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, batchSize int) { + var lambda, lambdain TC // add part for j := 0; j < batchSize; j++ { - lambdain[j].Sub(&P[j].X, &R[j].X) + lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X) } - // invert denominator - batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) + // invert denominator using montgomery batch invert technique + { + var accumulator fp.Element + accumulator.SetOne() + + for i := 0; i < batchSize; i++ { + lambda[i] = accumulator + accumulator.Mul(&accumulator, &lambdain[i]) + } + + accumulator.Inverse(&accumulator) + + for i := batchSize - 1; i >= 0; i-- { + lambda[i].Mul(&lambda[i], &accumulator) + accumulator.Mul(&accumulator, &lambdain[i]) + } + } var d fp.Element var rr G1Affine @@ -1119,36 +1130,16 @@ func batchAddG1Affine(R []*G1Affine, P []G1Affine) { // add part for j := 0; j < batchSize; j++ { // computa lambda - d.Sub(&P[j].Y, &R[j].Y) + d.Sub(&(*P)[j].Y, &(*R)[j].Y) lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[j].X) - rr.X.Sub(&rr.X, &P[j].X) - d.Sub(&R[j].X, &rr.X) + rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) - } -} - -// batch inversion -// similar to BatchInvertfp.Element, ignores edge cases -func batchInvertG1Affine(res, a []fp.Element) { - - var accumulator fp.Element - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } diff --git 
a/ecc/bw6-761/g2.go b/ecc/bw6-761/g2.go index 892cedad40..b1b8b664dd 100644 --- a/ecc/bw6-761/g2.go +++ b/ecc/bw6-761/g2.go @@ -958,20 +958,31 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAddG2Affine(R []*G2Affine, P []G2Affine) { - batchSize := len(R) - if batchSize == 0 { - return - } - var lambda, lambdain [MAX_BATCH_SIZE]fp.Element +func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, batchSize int) { + var lambda, lambdain TC // add part for j := 0; j < batchSize; j++ { - lambdain[j].Sub(&P[j].X, &R[j].X) + lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X) } - // invert denominator - batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) + // invert denominator using montgomery batch invert technique + { + var accumulator fp.Element + accumulator.SetOne() + + for i := 0; i < batchSize; i++ { + lambda[i] = accumulator + accumulator.Mul(&accumulator, &lambdain[i]) + } + + accumulator.Inverse(&accumulator) + + for i := batchSize - 1; i >= 0; i-- { + lambda[i].Mul(&lambda[i], &accumulator) + accumulator.Mul(&accumulator, &lambdain[i]) + } + } var d fp.Element var rr G2Affine @@ -979,36 +990,16 @@ func batchAddG2Affine(R []*G2Affine, P []G2Affine) { // add part for j := 0; j < batchSize; j++ { // computa lambda - d.Sub(&P[j].Y, &R[j].Y) + d.Sub(&(*P)[j].Y, &(*R)[j].Y) lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[j].X) - rr.X.Sub(&rr.X, &P[j].X) - d.Sub(&R[j].X, &rr.X) + rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) - } -} - -// batch inversion -// similar to BatchInvertfp.Element, ignores edge cases -func batchInvertG2Affine(res, a []fp.Element) { - - var accumulator fp.Element - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index f8165ca221..1fce2c8080 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -162,7 +162,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -374,7 +374,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, 
processChunk) default: panic("not implemented") diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index 83b2c11fbe..638f888c2a 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -16,15 +16,18 @@ package bw6761 -const MAX_BATCH_SIZE = 600 +import ( + "github.com/consensys/gnark-crypto/ecc/bw6-761/fp" + "github.com/consensys/gnark-crypto/ecc/bw6-761/internal/fptower" +) -type batchOp struct { - pointID uint32 +type batchOpG1Affine struct { bucketID uint16 + point G1Affine } -func (o batchOp) isNeg() bool { - return o.pointID&1 == 1 +func (o batchOpG1Affine) isNeg() bool { + return o.bucketID&1 == 1 } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -33,7 +36,8 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( + chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -47,22 +51,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G1Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G1Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -76,76 +71,98 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG1Affine(R[:cptAdd], P[:cptAdd]) + batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG1Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) return } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? 
+ BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G1Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . + if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue + } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() } + queue[i] = queue[qID-1] + qID-- } } @@ -154,7 +171,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -164,40 +181,47 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
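
For reference, the arithmetic in the batchAddG1Affine / batchAddG2Affine hunks above is the textbook affine chord addition R = R + P, with the division replaced by the batch-inverted denominator:

    lambda = (P.Y - R.Y) / (P.X - R.X)   // lambdain collects denominators; lambda receives their inverses
    X3     = lambda^2 - R.X - P.X
    Y3     = lambda*(R.X - X3) - R.Y

which is exactly the rr.X / rr.Y sequence written out. The tangent case P == R (slope 3*X^2 / (2*Y) on these a = 0 short Weierstrass curves) and the point at infinity never reach this code: as the comment on these functions says, doubling and infinity must be filtered out by the caller, which is what the special-case branches in addFromQueue and add do.
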
@@ -222,16 +246,44 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC4 [1 << (4 - 1)]G1Affine -type bucketG1AffineC5 [1 << (5 - 1)]G1Affine -type bucketG1AffineC8 [1 << (8 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +// buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { - bucketG1AffineC4 | - bucketG1AffineC5 | - bucketG1AffineC8 | - bucketG1AffineC16 + bucketG1AffineC16 +} + +// array of coordinates fp.Element +type cG1Affine interface { + cG1AffineC16 +} + +// buckets: array of G1Affine points (for the batch addition) +type pG1Affine interface { + pG1AffineC16 +} + +// buckets: array of *G1Affine points (for the batch addition) +type ppG1Affine interface { + ppG1AffineC16 +} + +// buckets: array of G1Affine queue operations (for the batch addition) +type qOpsG1Affine interface { + qOpsG1AffineC16 +} +type cG1AffineC16 [640]fp.Element +type pG1AffineC16 [640]G1Affine +type ppG1AffineC16 [640]*G1Affine +type qOpsG1AffineC16 [640]batchOpG1Affine + +type batchOpG2Affine struct { + bucketID uint16 + point G2Affine +} + +func (o batchOpG2Affine) isNeg() bool { + return o.bucketID&1 == 1 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -240,7 +292,8 @@ type ibG1Affine interface { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( + chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -254,22 +307,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G2Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G2Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -283,76 +327,98 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG2Affine(R[:cptAdd], P[:cptAdd]) + batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG2Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. 
+ // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() return } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G2Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . + if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue + } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() } + queue[i] = queue[qID-1] + qID-- } } @@ -361,7 +427,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -371,40 +437,47 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
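
The digit handling at the top of the main loops above packs a signed window into an unsigned value whose low bit is the sign. A standalone sketch of the decoding (the decode helper is illustrative, not part of the package): an even digit d means "add to bucket d/2 - 1", an odd digit means "subtract from bucket d>>1", and 0 means the window contributes nothing -- callers skip it, as the loops above do.

package main

import "fmt"

// decode undoes the packing done by partitionScalars: the low bit of a
// digit carries the sign of the window. Callers must skip digit == 0.
func decode(digit uint16) (bucketID uint16, isAdd bool) {
	isAdd = digit&1 == 0
	bucketID = digit >> 1
	if isAdd {
		bucketID-- // even digits are shifted by one so that 0 can mean "skip"
	}
	return
}

func main() {
	for _, d := range []uint16{2, 3, 8, 9} {
		b, add := decode(d)
		fmt.Printf("digit=%d -> bucket=%d isAdd=%v\n", d, b, add)
	}
	// digit=2 -> bucket=0 isAdd=true
	// digit=3 -> bucket=1 isAdd=false
	// digit=8 -> bucket=3 isAdd=true
	// digit=9 -> bucket=4 isAdd=false
}

Bucket k accumulates the multiple (k+1)*P, so decoding never yields a zero-weight bucket and digit 0 stays reserved for "skip".
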
@@ -429,17 +502,36 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allows us to allocate the buckets on the stack -type bucketG2AffineC4 [1 << (4 - 1)]G2Affine -type bucketG2AffineC5 [1 << (5 - 1)]G2Affine -type bucketG2AffineC8 [1 << (8 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +// buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { - bucketG2AffineC4 | - bucketG2AffineC5 | - bucketG2AffineC8 | - bucketG2AffineC16 + bucketG2AffineC16 +} + +// array of coordinates fp.Element +type cG2Affine interface { + cG2AffineC16 +} + +// buckets: array of G2Affine points (for the batch addition) +type pG2Affine interface { + pG2AffineC16 +} + +// buckets: array of *G2Affine points (for the batch addition) +type ppG2Affine interface { + ppG2AffineC16 +} + +// buckets: array of G2Affine queue operations (for the batch addition) +type qOpsG2Affine interface { + qOpsG2AffineC16 } +type cG2AffineC16 [640]fp.Element +type pG2AffineC16 [640]G2Affine +type ppG2AffineC16 [640]*G2Affine +type qOpsG2AffineC16 [640]batchOpG2Affine type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool diff --git a/internal/generator/ecc/generate.go b/internal/generator/ecc/generate.go index a4b3e9b5fd..6eb2c9f975 100644 --- a/internal/generator/ecc/generate.go +++ b/internal/generator/ecc/generate.go @@ -36,7 +36,39 @@ func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) er } return n - (c * (n / c)) } + batchSize := func(c int) int { + // nbBuckets := (1 << (c - 1)) + // if c <= 12 { + // return nbBuckets/10 + 3*c + // } + // if c <= 14 { + // return nbBuckets/15 + // } + // return nbBuckets / 20 + // TODO @gbotrel / @yelhousni this needs a better heuristic + // in theory, larger batch size == fewer inversions + // but if nbBuckets is small, then a large batch size will produce lots of collisions + // and queue ops. + // there is probably a cache-friendliness factor at play here too.
+ switch c { + case 10: + return 80 + case 11: + return 150 + case 12: + return 200 + case 13: + return 350 + case 14: + return 400 + case 15: + return 500 + default: + return 640 + } + } funcs["lastC"] = lastC + funcs["batchSize"] = batchSize funcs["contains"] = func(v int, s []int) bool { for _, sv := range s { diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index 4655925e5e..c5b4eb675b 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -429,7 +429,7 @@ func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffi {{- if le $c 9}} processChunk := processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}] {{- else}} - processChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{$c}}, bitSetC{{$c}}] + processChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{$c}}, bitSetC{{$c}}, p{{$.TAffine}}C{{$c}}, pp{{$.TAffine}}C{{$c}}, qOps{{$.TAffine}}C{{$c}}, c{{$.TAffine}}C{{$c}}] {{- end}} {{- if eq $c $lc}} _innerMsm{{ $.UPointName }}(p, {{$c}}, points, digits, splitFirstChunk, processChunk, processChunk) @@ -437,7 +437,7 @@ func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffi {{- if le $lc 9}} processLastChunk := processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{lastC $c}}] {{- else}} - processLastChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{lastC $c}}, bitSetC{{$c}}] + processLastChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{lastC $c}}, bitSetC{{$c}}, p{{$.TAffine}}C{{$c}}, pp{{$.TAffine}}C{{$c}}, qOps{{$.TAffine}}C{{$c}}, c{{$.TAffine}}C{{$c}}] {{- end}} _innerMsm{{ $.UPointName }}(p, {{$c}}, points, digits, splitFirstChunk, processChunk, processLastChunk) {{- end}} diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index cf1c0ce71f..d19c51dca7 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -7,26 +7,31 @@ {{ $G2TJacobianExtended := print (toLower .G2.PointName) "JacExtended" }} -const MAX_BATCH_SIZE = 600 +import ( + "github.com/consensys/gnark-crypto/ecc/{{.Name}}/internal/fptower" + "github.com/consensys/gnark-crypto/ecc/{{.Name}}/fp" +) + -type batchOp struct { - pointID uint32 - bucketID uint16 -} -func (o batchOp) isNeg() bool { - return o.pointID&1 == 1 -} -{{ template "multiexp" dict "PointName" .G1.PointName "UPointName" (toUpper .G1.PointName) "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange "LastCRange" .G1.LastCRange}} -{{ template "multiexp" dict "PointName" .G2.PointName "UPointName" (toUpper .G2.PointName) "TAffine" $G2TAffine "TJacobian" $G2TJacobian "TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G2.CRange "LastCRange" .G2.LastCRange}} +{{ template "multiexp" dict "CoordType" .G1.CoordType "PointName" .G1.PointName "UPointName" (toUpper .G1.PointName) "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange "LastCRange" .G1.LastCRange}} +{{ template "multiexp" dict "CoordType" .G2.CoordType "PointName" .G2.PointName "UPointName" (toUpper .G2.PointName) "TAffine" $G2TAffine "TJacobian" $G2TJacobian 
"TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G2.CRange "LastCRange" .G2.LastCRange}} {{define "multiexp" }} +type batchOp{{ $.TAffine }} struct { + bucketID uint16 + point {{ $.TAffine }} +} + +func (o batchOp{{ $.TAffine }}) isNeg() bool { + return o.bucketID&1 == 1 +} // processChunk{{ $.UPointName }}BatchAffine process a chunk of the scalars during the msm // using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition @@ -34,7 +39,8 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](chunk uint64, +func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet, TP p{{ $.TAffine }}, TPP pp{{ $.TAffine }}, TQ qOps{{ $.TAffine }}, TC c{{ $.TAffine}}]( + chunk uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, @@ -48,24 +54,16 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](c // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*{{ $.TAffine }} - - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]{{ $.TAffine }} - - + + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + + batchSize := len(P) + + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -78,78 +76,101 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](c if (cptAdd) == 0 { return } - batchAdd{{ $.TAffine }}(R[:cptAdd], P[:cptAdd]) + batchAdd{{ $.TAffine }}[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOp{{$.TAffine}}) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) return } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *{{$.TAffine}}, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? 
+ // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . + if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func () { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue + } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() } + queue[i] = queue[qID-1] + qID-- } } @@ -158,7 +179,7 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](c if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -168,40 +189,47 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](c for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit>>1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit>>1) - 1) - } else { - // sub - op.bucketID = (uint16((digit>>1))) - op.pointID += 1 + bucketID-=1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE - 1 { + + // queue is full, flush it. + if qID == len(queue) - 1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
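
The wall of interface and array types that follows (and its per-curve instantiations above) exists for one reason: a fixed-size array can live on the stack, while a buffer sized at runtime generally escapes to the heap. So every window size c gets its own array types, and processChunk is compiled once per size through its type parameters. A toy sketch of the mechanism, with made-up names:

package main

import "fmt"

type bufC4 [1 << (4 - 1)]int
type bufC5 [1 << (5 - 1)]int

// ibuf mirrors the ib*/p*/pp*/q* constraints: a union of fixed-size arrays
// sharing an element type.
type ibuf interface {
	bufC4 | bufC5
}

// fill is instantiated once per concrete array type; `var b B` has a
// compile-time-known size in each instantiation, so it can stay on the stack.
func fill[B ibuf](v int) B {
	var b B
	for i := 0; i < len(b); i++ {
		b[i] = v
	}
	return b
}

func main() {
	fmt.Println(len(fill[bufC4](1)), len(fill[bufC5](2))) // 8 16
}

Note the loop uses len and plain indexing rather than range: when a type set mixes array lengths, those are the operations that remain available on a value of the type parameter, and it is the same pattern the generated code relies on (batchSize := len(P)).
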
@@ -227,15 +255,69 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](c // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack {{- range $c := $.CRange}} +{{- if gt $c 9}} type bucket{{ $.TAffine }}C{{$c}} [1<<({{$c}}-1)]{{ $.TAffine }} {{- end}} +{{- end}} + +// buckets: array of {{ $.TAffine }} points of size 1 << (c-1) type ib{{ $.TAffine }} interface { {{- range $i, $c := $.CRange}} + {{- if gt $c 9}} bucket{{ $.TAffine }}C{{$c}} {{- if not (last $i $.CRange)}} | {{- end}} {{- end}} + {{- end}} +} + +// array of coordinates {{ $.CoordType }} +type c{{ $.TAffine }} interface { + {{- range $i, $c := $.CRange}} + {{- if gt $c 9}} + c{{ $.TAffine }}C{{$c}} {{- if not (last $i $.CRange)}} | {{- end}} + {{- end}} + {{- end}} +} + +// buckets: array of {{ $.TAffine }} points (for the batch addition) +type p{{ $.TAffine }} interface { + {{- range $i, $c := $.CRange}} + {{- if gt $c 9}} + p{{ $.TAffine }}C{{$c}} {{- if not (last $i $.CRange)}} | {{- end}} + {{- end}} + {{- end}} +} + +// buckets: array of *{{ $.TAffine }} points (for the batch addition) +type pp{{ $.TAffine }} interface { + {{- range $i, $c := $.CRange}} + {{- if gt $c 9}} + pp{{ $.TAffine }}C{{$c}} {{- if not (last $i $.CRange)}} | {{- end}} + {{- end}} + {{- end}} } +// buckets: array of {{ $.TAffine }} queue operations (for the batch addition) +type qOps{{ $.TAffine }} interface { + {{- range $i, $c := $.CRange}} + {{- if gt $c 9}} + qOps{{ $.TAffine }}C{{$c}} {{- if not (last $i $.CRange)}} | {{- end}} + {{- end}} + {{- end}} +} + + +{{- range $c := $.CRange}} +{{- if gt $c 9}} +type c{{ $.TAffine }}C{{$c}} [{{batchSize $c}}]{{ $.CoordType }} +type p{{ $.TAffine }}C{{$c}} [{{batchSize $c}}]{{ $.TAffine }} +type pp{{ $.TAffine }}C{{$c}} [{{batchSize $c}}]*{{ $.TAffine }} +type qOps{{ $.TAffine }}C{{$c}} [{{batchSize $c}}]batchOp{{ $.TAffine }} + +{{- end}} +{{- end}} + + {{end }} {{- range $c := $.G1.CRange}} diff --git a/internal/generator/ecc/template/point.go.tmpl b/internal/generator/ecc/template/point.go.tmpl index 6e30ad9de8..c5455ff072 100644 --- a/internal/generator/ecc/template/point.go.tmpl +++ b/internal/generator/ecc/template/point.go.tmpl @@ -1574,21 +1574,32 @@ func BatchScalarMultiplication{{ toUpper .PointName }}(base *{{ $TAffine }}, sca // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAdd{{ $TAffine }}(R []*{{ $TAffine }},P []{{ $TAffine }}) { - batchSize := len(R) - if batchSize == 0 { - return - } - var lambda, lambdain [MAX_BATCH_SIZE]{{.CoordType}} +func batchAdd{{ $TAffine }}[TP p{{ $TAffine }}, TPP pp{{ $TAffine }}, TC c{{ $TAffine }}](R *TPP,P *TP, batchSize int) { + var lambda, lambdain TC // add part for j := 0; j < batchSize; j++ { - lambdain[j].Sub(&P[j].X, &R[j].X) + lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X) } - // invert denominator - batchInvert{{ $TAffine }}(lambda[:batchSize], lambdain[:batchSize]) + // invert denominator using montgomery batch invert technique + { + var accumulator {{.CoordType}} + accumulator.SetOne() + + for i := 0; i < batchSize; i++ { + lambda[i] = accumulator + accumulator.Mul(&accumulator, &lambdain[i]) + } + + accumulator.Inverse(&accumulator) + + for i := batchSize - 1; i >= 0; i-- { + lambda[i].Mul(&lambda[i], &accumulator) + accumulator.Mul(&accumulator, &lambdain[i]) + } + } var d {{.CoordType}} var rr {{ $TAffine }} @@ -1596,38 +1607,17 @@ func batchAdd{{ $TAffine }}(R []*{{ $TAffine }},P 
[]{{ $TAffine }}) { // add part for j := 0; j < batchSize; j++ { // computa lambda - d.Sub(&P[j].Y, &R[j].Y) + d.Sub(&(*P)[j].Y, &(*R)[j].Y) lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[j].X) - rr.X.Sub(&rr.X, &P[j].X) - d.Sub(&R[j].X, &rr.X) + rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } - - -// batch inversion -// similar to BatchInvert{{.CoordType}}, ignores edge cases -func batchInvert{{ $TAffine }}(res, a []{{.CoordType}}) { - - var accumulator {{.CoordType}} - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) - } -} \ No newline at end of file From 2543ac331ad2d808ead1e960096fec15382217b2 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Mon, 14 Nov 2022 15:59:45 -0600 Subject: [PATCH 20/43] build: fix import in template --- ecc/bw6-633/multiexp_affine.go | 1 - ecc/bw6-756/multiexp_affine.go | 1 - ecc/bw6-761/multiexp_affine.go | 1 - internal/generator/ecc/template/multiexp_affine.go.tmpl | 4 +++- 4 files changed, 3 insertions(+), 4 deletions(-) diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index fd164fefd2..7d3323a044 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -18,7 +18,6 @@ package bw6633 import ( "github.com/consensys/gnark-crypto/ecc/bw6-633/fp" - "github.com/consensys/gnark-crypto/ecc/bw6-633/internal/fptower" ) type batchOpG1Affine struct { diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index a9f8f12db5..739b3dca2e 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -18,7 +18,6 @@ package bw6756 import ( "github.com/consensys/gnark-crypto/ecc/bw6-756/fp" - "github.com/consensys/gnark-crypto/ecc/bw6-756/internal/fptower" ) type batchOpG1Affine struct { diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index 638f888c2a..d00ef26272 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -18,7 +18,6 @@ package bw6761 import ( "github.com/consensys/gnark-crypto/ecc/bw6-761/fp" - "github.com/consensys/gnark-crypto/ecc/bw6-761/internal/fptower" ) type batchOpG1Affine struct { diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index d19c51dca7..ccfba7f2be 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -8,8 +8,10 @@ import ( - "github.com/consensys/gnark-crypto/ecc/{{.Name}}/internal/fptower" "github.com/consensys/gnark-crypto/ecc/{{.Name}}/fp" + {{- if ne .G1.CoordType .G2.CoordType}} + "github.com/consensys/gnark-crypto/ecc/{{.Name}}/internal/fptower" + {{- end}} ) From 5733bd23d67584088db377f3d36e61cc1d105c9a Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 15 Nov 2022 08:54:51 -0600 Subject: [PATCH 21/43] feat: use nbBits+1 instead of nbWords*64 for partitionScalars --- ecc/bls12-377/multiexp.go | 134 ++++++++--------- ecc/bls12-377/multiexp_affine.go | 84 +++++++---- ecc/bls12-377/multiexp_jacobian.go | 12 +- ecc/bls12-378/multiexp.go | 138 +++++++++--------- ecc/bls12-378/multiexp_affine.go | 84 
+++++++---- ecc/bls12-378/multiexp_jacobian.go | 12 +- ecc/bls12-381/multiexp.go | 76 +++++----- ecc/bls12-381/multiexp_affine.go | 84 +++++++---- ecc/bls24-315/multiexp.go | 134 ++++++++--------- ecc/bls24-315/multiexp_affine.go | 84 +++++++---- ecc/bls24-315/multiexp_jacobian.go | 12 +- ecc/bls24-317/multiexp.go | 76 +++++----- ecc/bls24-317/multiexp_affine.go | 84 +++++++---- ecc/bn254/multiexp.go | 138 +++++++++--------- ecc/bn254/multiexp_affine.go | 84 +++++++---- ecc/bn254/multiexp_jacobian.go | 12 +- ecc/bw6-633/multiexp.go | 70 ++++----- ecc/bw6-633/multiexp_affine.go | 12 +- ecc/bw6-633/multiexp_jacobian.go | 12 +- ecc/bw6-756/multiexp.go | 70 ++++----- ecc/bw6-756/multiexp_affine.go | 12 +- ecc/bw6-756/multiexp_jacobian.go | 12 +- ecc/bw6-761/multiexp.go | 74 +++++----- ecc/bw6-761/multiexp_affine.go | 12 +- ecc/bw6-761/multiexp_jacobian.go | 16 +- internal/generator/ecc/generate.go | 4 +- .../generator/ecc/template/multiexp.go.tmpl | 38 ++--- .../ecc/template/multiexp_affine.go.tmpl | 8 +- 28 files changed, 884 insertions(+), 704 deletions(-) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index 9f2a1998fc..4ec5027cfb 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,16 +103,16 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -153,53 +153,56 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] + _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] _innerMsmG1(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] - 
processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] _innerMsmG1(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 9: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qOpsG1AffineC10, cG1AffineC10] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qOpsG1AffineC11, cG1AffineC11] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qOpsG1AffineC12, cG1AffineC12] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qOpsG1AffineC13, cG1AffineC13] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qOpsG1AffineC14, cG1AffineC14] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qOpsG1AffineC15, cG1AffineC15] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + processChunk := processChunkG1BatchAffine[bucketG1AffineC15, 
bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] - _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -208,8 +211,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -340,7 +343,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -351,16 +354,16 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -401,53 +404,56 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] + _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processLastChunk := 
processChunkG2Jacobian[bucketg2JacExtendedC2] _innerMsmG2(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] _innerMsmG2(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 9: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qOpsG2AffineC10, cG2AffineC10] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qOpsG2AffineC11, cG2AffineC11] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qOpsG2AffineC12, cG2AffineC12] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qOpsG2AffineC13, cG2AffineC13] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] + processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qOpsG2AffineC14, cG2AffineC14] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, 
bitSetC15, pG2AffineC15, ppG2AffineC15, qOpsG2AffineC15, cG2AffineC15]
-		processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1]
+		processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15]
+		processLastChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15]
 		_innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk)
 	case 16:
-		processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16]
-		_innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk)
+		processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16]
+		processLastChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16]
+		_innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk)
 	default:
 		panic("not implemented")
 	}
@@ -456,8 +462,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config
 func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool,
 	processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac {

-	nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	if (fr.Limbs*64)%c != 0 {
+	nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar
+	if (fr.Bits+1)%c != 0 {
 		nbChunks++
 	}

@@ -543,8 +549,8 @@ type selector struct {
 // 0 < scalar < 2^c (in other words, scalars where only the c least-significant bits are non-zero)
 func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) {
 	// number of c-bit radixes in a scalar
-	nbChunks := fr.Limbs * 64 / c
-	if (fr.Limbs*64)%c != 0 {
+	nbChunks := (fr.Bits + 1) / c
+	if (fr.Bits+1)%c != 0 {
 		nbChunks++
 	}

@@ -618,11 +624,6 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks
 			digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
 		}

-		// if digit is zero, no impact on result
-		if digit == 0 {
-			continue
-		}
-
 		// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
 		// 2^{c} from the current digit, making it negative.
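// ---------------------------------------------------------------------------
// Editorial aside (illustrative sketch, not part of this patch): the hunk
// above recodes each c-bit window of the scalar into a signed digit in
// [-2^{c-1}, 2^{c-1}-1] and packs it as (magnitude << 1) | signBit, so a zero
// digit can simply be skipped. A minimal standalone version of that encoding;
// recodeWindows and its signature are hypothetical, not gnark-crypto API.

package main

import "fmt"

// recodeWindows turns raw c-bit windows (least-significant window first,
// each in [0, 2^c)) into packed signed digits, borrowing 2^c from the next
// window whenever a digit would reach 2^{c-1}.
func recodeWindows(windows []int, c uint) []uint16 {
	max := 1 << (c - 1)
	out := make([]uint16, len(windows)+1) // one extra window for a final carry
	carry := 0
	for k, d := range windows {
		d += carry
		carry = 0
		if d >= max {
			d -= 1 << c // borrow 2^c from the next window
			carry = 1
		}
		switch {
		case d == 0: // zero digit: no bucket update needed, leave out[k] = 0
		case d > 0:
			out[k] = uint16(d) << 1 // even value encodes +d
		default:
			out[k] = (uint16(-d-1) << 1) + 1 // odd value encodes d < 0
		}
	}
	out[len(windows)] = uint16(carry) << 1
	return out
}

func main() {
	// 0b1111_0001 split into two 4-bit windows: [1, 15] recodes to digits
	// [+1, -1, +1], i.e. 1 - 16 + 256 = 241. Packed output: [2 1 2].
	fmt.Println(recodeWindows([]int{1, 15}, 4))
}
// ---------------------------------------------------------------------------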
if digit >= max { @@ -631,17 +632,16 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } var bits uint16 - if digit >= 0 { + + // if digit is zero, no impact on result + if digit == 0 { + continue + } else if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits - // [s.index] |= (bits << s.shift) - // if s.multiWordSelect { - // toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - // } - } } diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index c1d32b5ded..3e16fddca6 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -300,42 +300,56 @@ type ppG1Affine interface { // buckets: array of G1Affine queue operations (for the batch addition) type qOpsG1Affine interface { - qOpsG1AffineC10 | - qOpsG1AffineC11 | - qOpsG1AffineC12 | - qOpsG1AffineC13 | - qOpsG1AffineC14 | - qOpsG1AffineC15 | - qOpsG1AffineC16 + qG1AffineC10 | + qG1AffineC11 | + qG1AffineC12 | + qG1AffineC13 | + qG1AffineC14 | + qG1AffineC15 | + qG1AffineC16 } + +// batch size 80 when c = 10 type cG1AffineC10 [80]fp.Element type pG1AffineC10 [80]G1Affine type ppG1AffineC10 [80]*G1Affine -type qOpsG1AffineC10 [80]batchOpG1Affine +type qG1AffineC10 [80]batchOpG1Affine + +// batch size 150 when c = 11 type cG1AffineC11 [150]fp.Element type pG1AffineC11 [150]G1Affine type ppG1AffineC11 [150]*G1Affine -type qOpsG1AffineC11 [150]batchOpG1Affine +type qG1AffineC11 [150]batchOpG1Affine + +// batch size 200 when c = 12 type cG1AffineC12 [200]fp.Element type pG1AffineC12 [200]G1Affine type ppG1AffineC12 [200]*G1Affine -type qOpsG1AffineC12 [200]batchOpG1Affine +type qG1AffineC12 [200]batchOpG1Affine + +// batch size 350 when c = 13 type cG1AffineC13 [350]fp.Element type pG1AffineC13 [350]G1Affine type ppG1AffineC13 [350]*G1Affine -type qOpsG1AffineC13 [350]batchOpG1Affine +type qG1AffineC13 [350]batchOpG1Affine + +// batch size 400 when c = 14 type cG1AffineC14 [400]fp.Element type pG1AffineC14 [400]G1Affine type ppG1AffineC14 [400]*G1Affine -type qOpsG1AffineC14 [400]batchOpG1Affine +type qG1AffineC14 [400]batchOpG1Affine + +// batch size 500 when c = 15 type cG1AffineC15 [500]fp.Element type pG1AffineC15 [500]G1Affine type ppG1AffineC15 [500]*G1Affine -type qOpsG1AffineC15 [500]batchOpG1Affine +type qG1AffineC15 [500]batchOpG1Affine + +// batch size 640 when c = 16 type cG1AffineC16 [640]fp.Element type pG1AffineC16 [640]G1Affine type ppG1AffineC16 [640]*G1Affine -type qOpsG1AffineC16 [640]batchOpG1Affine +type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 @@ -616,42 +630,56 @@ type ppG2Affine interface { // buckets: array of G2Affine queue operations (for the batch addition) type qOpsG2Affine interface { - qOpsG2AffineC10 | - qOpsG2AffineC11 | - qOpsG2AffineC12 | - qOpsG2AffineC13 | - qOpsG2AffineC14 | - qOpsG2AffineC15 | - qOpsG2AffineC16 + qG2AffineC10 | + qG2AffineC11 | + qG2AffineC12 | + qG2AffineC13 | + qG2AffineC14 | + qG2AffineC15 | + qG2AffineC16 } + +// batch size 80 when c = 10 type cG2AffineC10 [80]fptower.E2 type pG2AffineC10 [80]G2Affine type ppG2AffineC10 [80]*G2Affine -type qOpsG2AffineC10 [80]batchOpG2Affine +type qG2AffineC10 [80]batchOpG2Affine + +// batch size 150 when c = 11 type cG2AffineC11 [150]fptower.E2 type pG2AffineC11 [150]G2Affine type ppG2AffineC11 [150]*G2Affine -type qOpsG2AffineC11 [150]batchOpG2Affine +type qG2AffineC11 [150]batchOpG2Affine + +// batch size 200 when c = 12 type cG2AffineC12 
[200]fptower.E2 type pG2AffineC12 [200]G2Affine type ppG2AffineC12 [200]*G2Affine -type qOpsG2AffineC12 [200]batchOpG2Affine +type qG2AffineC12 [200]batchOpG2Affine + +// batch size 350 when c = 13 type cG2AffineC13 [350]fptower.E2 type pG2AffineC13 [350]G2Affine type ppG2AffineC13 [350]*G2Affine -type qOpsG2AffineC13 [350]batchOpG2Affine +type qG2AffineC13 [350]batchOpG2Affine + +// batch size 400 when c = 14 type cG2AffineC14 [400]fptower.E2 type pG2AffineC14 [400]G2Affine type ppG2AffineC14 [400]*G2Affine -type qOpsG2AffineC14 [400]batchOpG2Affine +type qG2AffineC14 [400]batchOpG2Affine + +// batch size 500 when c = 15 type cG2AffineC15 [500]fptower.E2 type pG2AffineC15 [500]G2Affine type ppG2AffineC15 [500]*G2Affine -type qOpsG2AffineC15 [500]batchOpG2Affine +type qG2AffineC15 [500]batchOpG2Affine + +// batch size 640 when c = 16 type cG2AffineC16 [640]fptower.E2 type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine -type qOpsG2AffineC16 [640]batchOpG2Affine +type qG2AffineC16 [640]batchOpG2Affine type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool diff --git a/ecc/bls12-377/multiexp_jacobian.go b/ecc/bls12-377/multiexp_jacobian.go index ae9aca6c47..e3c590196f 100644 --- a/ecc/bls12-377/multiexp_jacobian.go +++ b/ecc/bls12-377/multiexp_jacobian.go @@ -74,12 +74,12 @@ type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC3 | + bucketg1JacExtendedC2 | + bucketg1JacExtendedC1 | bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC6 | @@ -153,12 +153,12 @@ type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC3 | + bucketg2JacExtendedC2 | + bucketg2JacExtendedC1 | bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC6 | diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index d65962591c..bf0a181fde 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,16 +103,16 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's 
see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -153,53 +153,54 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] + _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processChunk) case 6: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] + _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 9: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qOpsG1AffineC10, cG1AffineC10] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qOpsG1AffineC11, cG1AffineC11] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qOpsG1AffineC12, cG1AffineC12] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processChunk := 
processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qOpsG1AffineC13, cG1AffineC13] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qOpsG1AffineC14, cG1AffineC14] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qOpsG1AffineC15, cG1AffineC15] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) + processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] + _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] - _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -208,8 +209,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -340,7 +341,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -351,16 +352,16 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C 
!= 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -401,53 +402,54 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] + _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processChunk) case 6: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] + _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 9: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qOpsG2AffineC10, cG2AffineC10] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qOpsG2AffineC11, cG2AffineC11] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := 
processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qOpsG2AffineC12, cG2AffineC12] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qOpsG2AffineC13, cG2AffineC13] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] + processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qOpsG2AffineC14, cG2AffineC14] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qOpsG2AffineC15, cG2AffineC15] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] + _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16] - _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -456,8 +458,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -543,8 +545,8 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs*64)%c != 0 { + nbChunks := (fr.Bits + 1) / c + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -618,11 
+620,6 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh } - // if digit is zero, no impact on result - if digit == 0 { - continue - } - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. if digit >= max { @@ -631,17 +628,16 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } var bits uint16 - if digit >= 0 { + + // if digit is zero, no impact on result + if digit == 0 { + continue + } else if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits - // [s.index] |= (bits << s.shift) - // if s.multiWordSelect { - // toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - // } - } } diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index f060ffc11a..ed8968000e 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -300,42 +300,56 @@ type ppG1Affine interface { // buckets: array of G1Affine queue operations (for the batch addition) type qOpsG1Affine interface { - qOpsG1AffineC10 | - qOpsG1AffineC11 | - qOpsG1AffineC12 | - qOpsG1AffineC13 | - qOpsG1AffineC14 | - qOpsG1AffineC15 | - qOpsG1AffineC16 + qG1AffineC10 | + qG1AffineC11 | + qG1AffineC12 | + qG1AffineC13 | + qG1AffineC14 | + qG1AffineC15 | + qG1AffineC16 } + +// batch size 80 when c = 10 type cG1AffineC10 [80]fp.Element type pG1AffineC10 [80]G1Affine type ppG1AffineC10 [80]*G1Affine -type qOpsG1AffineC10 [80]batchOpG1Affine +type qG1AffineC10 [80]batchOpG1Affine + +// batch size 150 when c = 11 type cG1AffineC11 [150]fp.Element type pG1AffineC11 [150]G1Affine type ppG1AffineC11 [150]*G1Affine -type qOpsG1AffineC11 [150]batchOpG1Affine +type qG1AffineC11 [150]batchOpG1Affine + +// batch size 200 when c = 12 type cG1AffineC12 [200]fp.Element type pG1AffineC12 [200]G1Affine type ppG1AffineC12 [200]*G1Affine -type qOpsG1AffineC12 [200]batchOpG1Affine +type qG1AffineC12 [200]batchOpG1Affine + +// batch size 350 when c = 13 type cG1AffineC13 [350]fp.Element type pG1AffineC13 [350]G1Affine type ppG1AffineC13 [350]*G1Affine -type qOpsG1AffineC13 [350]batchOpG1Affine +type qG1AffineC13 [350]batchOpG1Affine + +// batch size 400 when c = 14 type cG1AffineC14 [400]fp.Element type pG1AffineC14 [400]G1Affine type ppG1AffineC14 [400]*G1Affine -type qOpsG1AffineC14 [400]batchOpG1Affine +type qG1AffineC14 [400]batchOpG1Affine + +// batch size 500 when c = 15 type cG1AffineC15 [500]fp.Element type pG1AffineC15 [500]G1Affine type ppG1AffineC15 [500]*G1Affine -type qOpsG1AffineC15 [500]batchOpG1Affine +type qG1AffineC15 [500]batchOpG1Affine + +// batch size 640 when c = 16 type cG1AffineC16 [640]fp.Element type pG1AffineC16 [640]G1Affine type ppG1AffineC16 [640]*G1Affine -type qOpsG1AffineC16 [640]batchOpG1Affine +type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 @@ -616,42 +630,56 @@ type ppG2Affine interface { // buckets: array of G2Affine queue operations (for the batch addition) type qOpsG2Affine interface { - qOpsG2AffineC10 | - qOpsG2AffineC11 | - qOpsG2AffineC12 | - qOpsG2AffineC13 | - qOpsG2AffineC14 | - qOpsG2AffineC15 | - qOpsG2AffineC16 + qG2AffineC10 | + qG2AffineC11 | + qG2AffineC12 | + qG2AffineC13 | + qG2AffineC14 | + qG2AffineC15 | + qG2AffineC16 } + +// batch size 80 when c = 10 type cG2AffineC10 [80]fptower.E2 type pG2AffineC10 [80]G2Affine type ppG2AffineC10 
[80]*G2Affine -type qOpsG2AffineC10 [80]batchOpG2Affine +type qG2AffineC10 [80]batchOpG2Affine + +// batch size 150 when c = 11 type cG2AffineC11 [150]fptower.E2 type pG2AffineC11 [150]G2Affine type ppG2AffineC11 [150]*G2Affine -type qOpsG2AffineC11 [150]batchOpG2Affine +type qG2AffineC11 [150]batchOpG2Affine + +// batch size 200 when c = 12 type cG2AffineC12 [200]fptower.E2 type pG2AffineC12 [200]G2Affine type ppG2AffineC12 [200]*G2Affine -type qOpsG2AffineC12 [200]batchOpG2Affine +type qG2AffineC12 [200]batchOpG2Affine + +// batch size 350 when c = 13 type cG2AffineC13 [350]fptower.E2 type pG2AffineC13 [350]G2Affine type ppG2AffineC13 [350]*G2Affine -type qOpsG2AffineC13 [350]batchOpG2Affine +type qG2AffineC13 [350]batchOpG2Affine + +// batch size 400 when c = 14 type cG2AffineC14 [400]fptower.E2 type pG2AffineC14 [400]G2Affine type ppG2AffineC14 [400]*G2Affine -type qOpsG2AffineC14 [400]batchOpG2Affine +type qG2AffineC14 [400]batchOpG2Affine + +// batch size 500 when c = 15 type cG2AffineC15 [500]fptower.E2 type pG2AffineC15 [500]G2Affine type ppG2AffineC15 [500]*G2Affine -type qOpsG2AffineC15 [500]batchOpG2Affine +type qG2AffineC15 [500]batchOpG2Affine + +// batch size 640 when c = 16 type cG2AffineC16 [640]fptower.E2 type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine -type qOpsG2AffineC16 [640]batchOpG2Affine +type qG2AffineC16 [640]batchOpG2Affine type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool diff --git a/ecc/bls12-378/multiexp_jacobian.go b/ecc/bls12-378/multiexp_jacobian.go index 0637114932..97a6ac8ac0 100644 --- a/ecc/bls12-378/multiexp_jacobian.go +++ b/ecc/bls12-378/multiexp_jacobian.go @@ -74,12 +74,12 @@ type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC3 | + bucketg1JacExtendedC3 | + bucketg1JacExtendedC2 | bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC6 | @@ -153,12 +153,12 @@ type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC3 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC2 | bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC6 | diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index edcb161b5e..4e7c44d879 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,16 +103,16 @@ func (p *G1Jac) MultiExp(points []G1Affine, 
scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -174,31 +174,31 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qOpsG1AffineC10, cG1AffineC10] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qOpsG1AffineC11, cG1AffineC11] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qOpsG1AffineC12, cG1AffineC12] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qOpsG1AffineC13, cG1AffineC13] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qOpsG1AffineC14, cG1AffineC14] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qOpsG1AffineC15, cG1AffineC15] + processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, 
processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -208,8 +208,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -340,7 +340,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -351,16 +351,16 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -422,31 +422,31 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qOpsG2AffineC10, cG2AffineC10] + processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qOpsG2AffineC11, cG2AffineC11] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qOpsG2AffineC12, cG2AffineC12] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, 
qG2AffineC12, cG2AffineC12]
 		processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4]
 		_innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk)
 	case 13:
-		processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qOpsG2AffineC13, cG2AffineC13]
+		processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13]
 		processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9]
 		_innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk)
 	case 14:
-		processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qOpsG2AffineC14, cG2AffineC14]
+		processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14]
 		processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4]
 		_innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk)
 	case 15:
-		processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qOpsG2AffineC15, cG2AffineC15]
+		processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15]
 		processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1]
 		_innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk)
 	case 16:
-		processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16]
+		processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16]
 		_innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk)
 	default:
 		panic("not implemented")
@@ -456,8 +456,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config
 func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool,
 	processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac {

-	nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	if (fr.Limbs*64)%c != 0 {
+	nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar
+	if (fr.Bits+1)%c != 0 {
 		nbChunks++
 	}

@@ -543,8 +543,8 @@ type selector struct {
 // 0 < scalar < 2^c (in other words, scalars where only the c least-significant bits are non-zero)
 func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) {
 	// number of c-bit radixes in a scalar
-	nbChunks := fr.Limbs * 64 / c
-	if (fr.Limbs*64)%c != 0 {
+	nbChunks := (fr.Bits + 1) / c
+	if (fr.Bits+1)%c != 0 {
 		nbChunks++
 	}

@@ -618,11 +618,6 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks
 			digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
 		}

-		// if digit is zero, no impact on result
-		if digit == 0 {
-			continue
-		}
-
 		// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
 		// 2^{c} from the current digit, making it negative.
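// ---------------------------------------------------------------------------
// Editorial aside (illustrative sketch, not part of this patch): every
// fr.Limbs*64 -> fr.Bits+1 change in these hunks is the same computation,
// ceil((fr.Bits+1)/c). The +1 presumably leaves room for the borrow that the
// signed-digit recoding can propagate out of the top window. A standalone
// sketch; frBits is an assumed value here, not the per-curve generated
// constant.

package main

import "fmt"

const frBits = 253 // assumption: fr.Bits for a BLS12-381-like scalar field

// nbChunks returns the number of c-bit windows covering a recoded scalar.
func nbChunks(c uint64) uint64 {
	n := (frBits + 1) / c
	if (frBits+1)%c != 0 {
		n++ // one extra, partially filled window
	}
	return n
}

func main() {
	for _, c := range []uint64{4, 8, 11, 16} {
		fmt.Printf("c=%2d -> %d chunks\n", c, nbChunks(c))
	}
}
// ---------------------------------------------------------------------------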
if digit >= max { @@ -631,17 +626,16 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } var bits uint16 - if digit >= 0 { + + // if digit is zero, no impact on result + if digit == 0 { + continue + } else if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits - // [s.index] |= (bits << s.shift) - // if s.multiWordSelect { - // toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - // } - } } diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index da6be7a817..f6fb9e2aac 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -300,42 +300,56 @@ type ppG1Affine interface { // buckets: array of G1Affine queue operations (for the batch addition) type qOpsG1Affine interface { - qOpsG1AffineC10 | - qOpsG1AffineC11 | - qOpsG1AffineC12 | - qOpsG1AffineC13 | - qOpsG1AffineC14 | - qOpsG1AffineC15 | - qOpsG1AffineC16 + qG1AffineC10 | + qG1AffineC11 | + qG1AffineC12 | + qG1AffineC13 | + qG1AffineC14 | + qG1AffineC15 | + qG1AffineC16 } + +// batch size 80 when c = 10 type cG1AffineC10 [80]fp.Element type pG1AffineC10 [80]G1Affine type ppG1AffineC10 [80]*G1Affine -type qOpsG1AffineC10 [80]batchOpG1Affine +type qG1AffineC10 [80]batchOpG1Affine + +// batch size 150 when c = 11 type cG1AffineC11 [150]fp.Element type pG1AffineC11 [150]G1Affine type ppG1AffineC11 [150]*G1Affine -type qOpsG1AffineC11 [150]batchOpG1Affine +type qG1AffineC11 [150]batchOpG1Affine + +// batch size 200 when c = 12 type cG1AffineC12 [200]fp.Element type pG1AffineC12 [200]G1Affine type ppG1AffineC12 [200]*G1Affine -type qOpsG1AffineC12 [200]batchOpG1Affine +type qG1AffineC12 [200]batchOpG1Affine + +// batch size 350 when c = 13 type cG1AffineC13 [350]fp.Element type pG1AffineC13 [350]G1Affine type ppG1AffineC13 [350]*G1Affine -type qOpsG1AffineC13 [350]batchOpG1Affine +type qG1AffineC13 [350]batchOpG1Affine + +// batch size 400 when c = 14 type cG1AffineC14 [400]fp.Element type pG1AffineC14 [400]G1Affine type ppG1AffineC14 [400]*G1Affine -type qOpsG1AffineC14 [400]batchOpG1Affine +type qG1AffineC14 [400]batchOpG1Affine + +// batch size 500 when c = 15 type cG1AffineC15 [500]fp.Element type pG1AffineC15 [500]G1Affine type ppG1AffineC15 [500]*G1Affine -type qOpsG1AffineC15 [500]batchOpG1Affine +type qG1AffineC15 [500]batchOpG1Affine + +// batch size 640 when c = 16 type cG1AffineC16 [640]fp.Element type pG1AffineC16 [640]G1Affine type ppG1AffineC16 [640]*G1Affine -type qOpsG1AffineC16 [640]batchOpG1Affine +type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 @@ -616,42 +630,56 @@ type ppG2Affine interface { // buckets: array of G2Affine queue operations (for the batch addition) type qOpsG2Affine interface { - qOpsG2AffineC10 | - qOpsG2AffineC11 | - qOpsG2AffineC12 | - qOpsG2AffineC13 | - qOpsG2AffineC14 | - qOpsG2AffineC15 | - qOpsG2AffineC16 + qG2AffineC10 | + qG2AffineC11 | + qG2AffineC12 | + qG2AffineC13 | + qG2AffineC14 | + qG2AffineC15 | + qG2AffineC16 } + +// batch size 80 when c = 10 type cG2AffineC10 [80]fptower.E2 type pG2AffineC10 [80]G2Affine type ppG2AffineC10 [80]*G2Affine -type qOpsG2AffineC10 [80]batchOpG2Affine +type qG2AffineC10 [80]batchOpG2Affine + +// batch size 150 when c = 11 type cG2AffineC11 [150]fptower.E2 type pG2AffineC11 [150]G2Affine type ppG2AffineC11 [150]*G2Affine -type qOpsG2AffineC11 [150]batchOpG2Affine +type qG2AffineC11 [150]batchOpG2Affine + +// batch size 200 when c = 12 type cG2AffineC12 
[200]fptower.E2 type pG2AffineC12 [200]G2Affine type ppG2AffineC12 [200]*G2Affine -type qOpsG2AffineC12 [200]batchOpG2Affine +type qG2AffineC12 [200]batchOpG2Affine + +// batch size 350 when c = 13 type cG2AffineC13 [350]fptower.E2 type pG2AffineC13 [350]G2Affine type ppG2AffineC13 [350]*G2Affine -type qOpsG2AffineC13 [350]batchOpG2Affine +type qG2AffineC13 [350]batchOpG2Affine + +// batch size 400 when c = 14 type cG2AffineC14 [400]fptower.E2 type pG2AffineC14 [400]G2Affine type ppG2AffineC14 [400]*G2Affine -type qOpsG2AffineC14 [400]batchOpG2Affine +type qG2AffineC14 [400]batchOpG2Affine + +// batch size 500 when c = 15 type cG2AffineC15 [500]fptower.E2 type pG2AffineC15 [500]G2Affine type ppG2AffineC15 [500]*G2Affine -type qOpsG2AffineC15 [500]batchOpG2Affine +type qG2AffineC15 [500]batchOpG2Affine + +// batch size 640 when c = 16 type cG2AffineC16 [640]fptower.E2 type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine -type qOpsG2AffineC16 [640]batchOpG2Affine +type qG2AffineC16 [640]batchOpG2Affine type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index ebaf6a86f7..828296544e 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,16 +103,16 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -153,53 +153,56 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] + _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] _innerMsmG1(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := 
processChunkG1Jacobian[bucketg1JacExtendedC7] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] _innerMsmG1(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 9: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qOpsG1AffineC10, cG1AffineC10] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qOpsG1AffineC11, cG1AffineC11] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qOpsG1AffineC12, cG1AffineC12] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qOpsG1AffineC13, cG1AffineC13] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qOpsG1AffineC14, cG1AffineC14] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qOpsG1AffineC15, cG1AffineC15] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + processChunk 
:= processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] - _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -208,8 +211,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -340,7 +343,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -351,16 +354,16 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -401,53 +404,56 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] + _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - processLastChunk := 
processChunkG2Jacobian[bucketg2JacExtendedC4] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] _innerMsmG2(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] _innerMsmG2(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 9: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qOpsG2AffineC10, cG2AffineC10] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qOpsG2AffineC11, cG2AffineC11] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qOpsG2AffineC12, cG2AffineC12] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qOpsG2AffineC13, cG2AffineC13] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] + processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qOpsG2AffineC14, cG2AffineC14] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 
15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qOpsG2AffineC15, cG2AffineC15] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] + processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16] - _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -456,8 +462,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -543,8 +549,8 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs*64)%c != 0 { + nbChunks := (fr.Bits + 1) / c + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -618,11 +624,6 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh } - // if digit is zero, no impact on result - if digit == 0 { - continue - } - // if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract // 2^{c} from the current digit, making it negative.
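For readers tracking the digit-handling hunk above: the zero-digit skip now happens at encoding time, after the borrow has been propagated, so a borrowed window is never silently dropped. A minimal standalone sketch of the signed-digit packing the generated code implements (encodeDigit and decodeDigit are illustrative helper names, not identifiers from this patch):

// Each c-bit window digit is first recoded into (-2^{c-1}, 2^{c-1}], then
// packed into a uint16: positive digits become even values, negative digits
// become odd values, and 0 means "skip this window entirely".
func encodeDigit(digit int) uint16 {
	if digit == 0 {
		return 0 // no bucket update for this window
	}
	if digit > 0 {
		return uint16(digit) << 1 // even: add the point to bucket digit-1
	}
	return (uint16(-digit-1) << 1) + 1 // odd: subtract the point from bucket -digit-1
}

// decodeDigit inverts the packing; callers skip bits == 0 before decoding.
func decodeDigit(bits uint16) (bucketID uint16, subtractPoint bool) {
	if bits&1 == 0 {
		return (bits >> 1) - 1, false
	}
	return bits >> 1, true
}

Halving the digit range this way is what lets the bucket arrays (and the bitSetCxx types) hold only 2^(c-1) entries instead of 2^c.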
if digit >= max { @@ -631,17 +632,16 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } var bits uint16 - if digit >= 0 { + + // if digit is zero, no impact on result + if digit == 0 { + continue + } else if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits - // [s.index] |= (bits << s.shift) - // if s.multiWordSelect { - // toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - // } - } } diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index 800c106b7d..135ccaf2b2 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -300,42 +300,56 @@ type ppG1Affine interface { // buckets: array of G1Affine queue operations (for the batch addition) type qOpsG1Affine interface { - qOpsG1AffineC10 | - qOpsG1AffineC11 | - qOpsG1AffineC12 | - qOpsG1AffineC13 | - qOpsG1AffineC14 | - qOpsG1AffineC15 | - qOpsG1AffineC16 + qG1AffineC10 | + qG1AffineC11 | + qG1AffineC12 | + qG1AffineC13 | + qG1AffineC14 | + qG1AffineC15 | + qG1AffineC16 } + +// batch size 80 when c = 10 type cG1AffineC10 [80]fp.Element type pG1AffineC10 [80]G1Affine type ppG1AffineC10 [80]*G1Affine -type qOpsG1AffineC10 [80]batchOpG1Affine +type qG1AffineC10 [80]batchOpG1Affine + +// batch size 150 when c = 11 type cG1AffineC11 [150]fp.Element type pG1AffineC11 [150]G1Affine type ppG1AffineC11 [150]*G1Affine -type qOpsG1AffineC11 [150]batchOpG1Affine +type qG1AffineC11 [150]batchOpG1Affine + +// batch size 200 when c = 12 type cG1AffineC12 [200]fp.Element type pG1AffineC12 [200]G1Affine type ppG1AffineC12 [200]*G1Affine -type qOpsG1AffineC12 [200]batchOpG1Affine +type qG1AffineC12 [200]batchOpG1Affine + +// batch size 350 when c = 13 type cG1AffineC13 [350]fp.Element type pG1AffineC13 [350]G1Affine type ppG1AffineC13 [350]*G1Affine -type qOpsG1AffineC13 [350]batchOpG1Affine +type qG1AffineC13 [350]batchOpG1Affine + +// batch size 400 when c = 14 type cG1AffineC14 [400]fp.Element type pG1AffineC14 [400]G1Affine type ppG1AffineC14 [400]*G1Affine -type qOpsG1AffineC14 [400]batchOpG1Affine +type qG1AffineC14 [400]batchOpG1Affine + +// batch size 500 when c = 15 type cG1AffineC15 [500]fp.Element type pG1AffineC15 [500]G1Affine type ppG1AffineC15 [500]*G1Affine -type qOpsG1AffineC15 [500]batchOpG1Affine +type qG1AffineC15 [500]batchOpG1Affine + +// batch size 640 when c = 16 type cG1AffineC16 [640]fp.Element type pG1AffineC16 [640]G1Affine type ppG1AffineC16 [640]*G1Affine -type qOpsG1AffineC16 [640]batchOpG1Affine +type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 @@ -616,42 +630,56 @@ type ppG2Affine interface { // buckets: array of G2Affine queue operations (for the batch addition) type qOpsG2Affine interface { - qOpsG2AffineC10 | - qOpsG2AffineC11 | - qOpsG2AffineC12 | - qOpsG2AffineC13 | - qOpsG2AffineC14 | - qOpsG2AffineC15 | - qOpsG2AffineC16 + qG2AffineC10 | + qG2AffineC11 | + qG2AffineC12 | + qG2AffineC13 | + qG2AffineC14 | + qG2AffineC15 | + qG2AffineC16 } + +// batch size 80 when c = 10 type cG2AffineC10 [80]fptower.E4 type pG2AffineC10 [80]G2Affine type ppG2AffineC10 [80]*G2Affine -type qOpsG2AffineC10 [80]batchOpG2Affine +type qG2AffineC10 [80]batchOpG2Affine + +// batch size 150 when c = 11 type cG2AffineC11 [150]fptower.E4 type pG2AffineC11 [150]G2Affine type ppG2AffineC11 [150]*G2Affine -type qOpsG2AffineC11 [150]batchOpG2Affine +type qG2AffineC11 [150]batchOpG2Affine + +// batch size 200 when c = 12 type cG2AffineC12 
[200]fptower.E4 type pG2AffineC12 [200]G2Affine type ppG2AffineC12 [200]*G2Affine -type qOpsG2AffineC12 [200]batchOpG2Affine +type qG2AffineC12 [200]batchOpG2Affine + +// batch size 350 when c = 13 type cG2AffineC13 [350]fptower.E4 type pG2AffineC13 [350]G2Affine type ppG2AffineC13 [350]*G2Affine -type qOpsG2AffineC13 [350]batchOpG2Affine +type qG2AffineC13 [350]batchOpG2Affine + +// batch size 400 when c = 14 type cG2AffineC14 [400]fptower.E4 type pG2AffineC14 [400]G2Affine type ppG2AffineC14 [400]*G2Affine -type qOpsG2AffineC14 [400]batchOpG2Affine +type qG2AffineC14 [400]batchOpG2Affine + +// batch size 500 when c = 15 type cG2AffineC15 [500]fptower.E4 type pG2AffineC15 [500]G2Affine type ppG2AffineC15 [500]*G2Affine -type qOpsG2AffineC15 [500]batchOpG2Affine +type qG2AffineC15 [500]batchOpG2Affine + +// batch size 640 when c = 16 type cG2AffineC16 [640]fptower.E4 type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine -type qOpsG2AffineC16 [640]batchOpG2Affine +type qG2AffineC16 [640]batchOpG2Affine type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool diff --git a/ecc/bls24-315/multiexp_jacobian.go b/ecc/bls24-315/multiexp_jacobian.go index 6e3ea0e2f9..9f01ed9a7a 100644 --- a/ecc/bls24-315/multiexp_jacobian.go +++ b/ecc/bls24-315/multiexp_jacobian.go @@ -74,12 +74,12 @@ type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC3 | + bucketg1JacExtendedC2 | + bucketg1JacExtendedC1 | bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC6 | @@ -153,12 +153,12 @@ type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC3 | + bucketg2JacExtendedC2 | + bucketg2JacExtendedC1 | bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC6 | diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index c05f920246..c61cd372ec 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,16 +103,16 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's 
see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -174,31 +174,31 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qOpsG1AffineC10, cG1AffineC10] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qOpsG1AffineC11, cG1AffineC11] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qOpsG1AffineC12, cG1AffineC12] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qOpsG1AffineC13, cG1AffineC13] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qOpsG1AffineC14, cG1AffineC14] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qOpsG1AffineC15, cG1AffineC15] + processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, 
processChunk) default: panic("not implemented") @@ -208,8 +208,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -340,7 +340,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -351,16 +351,16 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -422,31 +422,31 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qOpsG2AffineC10, cG2AffineC10] + processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qOpsG2AffineC11, cG2AffineC11] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qOpsG2AffineC12, cG2AffineC12] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qOpsG2AffineC13, cG2AffineC13] + processChunk := 
processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qOpsG2AffineC14, cG2AffineC14] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qOpsG2AffineC15, cG2AffineC15] + processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -456,8 +456,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -543,8 +543,8 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs*64)%c != 0 { + nbChunks := (fr.Bits + 1) / c + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -618,11 +618,6 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh } - // if digit is zero, no impact on result - if digit == 0 { - continue - } - // if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract // 2^{c} from the current digit, making it negative.
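The recurring fr.Limbs * 64 → fr.Bits + 1 substitution deserves a note: the chunk count is now derived from the number of meaningful scalar bits, plus one bit of headroom so the final borrow of the signed-digit recoding has somewhere to land, rather than from the 64-bit limb storage size. A sketch of the computation (frBits stands in for fr.Bits; this helper is illustrative, not part of the patch):

// nbChunks mirrors the updated chunk-count logic in _innerMsmG1/_innerMsmG2.
// For bn254, where fr.Bits = 254, and c = 16: (254+1)/16 = 15 with remainder
// 15, so the scalar is cut into 16 chunks and the last one is 15 bits wide.
func nbChunks(frBits, c uint64) uint64 {
	n := (frBits + 1) / c
	if (frBits+1)%c != 0 {
		n++
	}
	return n
}

For some window sizes this yields strictly fewer chunks than the limb-based count, for example c = 15 on a 254-bit scalar: (254+1)/15 = 17 chunks exactly, versus 18 with fr.Limbs*64 = 256, which removes an entire bucket-accumulation pass.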
if digit >= max { @@ -631,17 +626,16 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } var bits uint16 - if digit >= 0 { + + // if digit is zero, no impact on result + if digit == 0 { + continue + } else if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits - // [s.index] |= (bits << s.shift) - // if s.multiWordSelect { - // toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - // } - } } diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index f1fb40dea1..252a20acca 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -300,42 +300,56 @@ type ppG1Affine interface { // buckets: array of G1Affine queue operations (for the batch addition) type qOpsG1Affine interface { - qOpsG1AffineC10 | - qOpsG1AffineC11 | - qOpsG1AffineC12 | - qOpsG1AffineC13 | - qOpsG1AffineC14 | - qOpsG1AffineC15 | - qOpsG1AffineC16 + qG1AffineC10 | + qG1AffineC11 | + qG1AffineC12 | + qG1AffineC13 | + qG1AffineC14 | + qG1AffineC15 | + qG1AffineC16 } + +// batch size 80 when c = 10 type cG1AffineC10 [80]fp.Element type pG1AffineC10 [80]G1Affine type ppG1AffineC10 [80]*G1Affine -type qOpsG1AffineC10 [80]batchOpG1Affine +type qG1AffineC10 [80]batchOpG1Affine + +// batch size 150 when c = 11 type cG1AffineC11 [150]fp.Element type pG1AffineC11 [150]G1Affine type ppG1AffineC11 [150]*G1Affine -type qOpsG1AffineC11 [150]batchOpG1Affine +type qG1AffineC11 [150]batchOpG1Affine + +// batch size 200 when c = 12 type cG1AffineC12 [200]fp.Element type pG1AffineC12 [200]G1Affine type ppG1AffineC12 [200]*G1Affine -type qOpsG1AffineC12 [200]batchOpG1Affine +type qG1AffineC12 [200]batchOpG1Affine + +// batch size 350 when c = 13 type cG1AffineC13 [350]fp.Element type pG1AffineC13 [350]G1Affine type ppG1AffineC13 [350]*G1Affine -type qOpsG1AffineC13 [350]batchOpG1Affine +type qG1AffineC13 [350]batchOpG1Affine + +// batch size 400 when c = 14 type cG1AffineC14 [400]fp.Element type pG1AffineC14 [400]G1Affine type ppG1AffineC14 [400]*G1Affine -type qOpsG1AffineC14 [400]batchOpG1Affine +type qG1AffineC14 [400]batchOpG1Affine + +// batch size 500 when c = 15 type cG1AffineC15 [500]fp.Element type pG1AffineC15 [500]G1Affine type ppG1AffineC15 [500]*G1Affine -type qOpsG1AffineC15 [500]batchOpG1Affine +type qG1AffineC15 [500]batchOpG1Affine + +// batch size 640 when c = 16 type cG1AffineC16 [640]fp.Element type pG1AffineC16 [640]G1Affine type ppG1AffineC16 [640]*G1Affine -type qOpsG1AffineC16 [640]batchOpG1Affine +type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 @@ -616,42 +630,56 @@ type ppG2Affine interface { // buckets: array of G2Affine queue operations (for the batch addition) type qOpsG2Affine interface { - qOpsG2AffineC10 | - qOpsG2AffineC11 | - qOpsG2AffineC12 | - qOpsG2AffineC13 | - qOpsG2AffineC14 | - qOpsG2AffineC15 | - qOpsG2AffineC16 + qG2AffineC10 | + qG2AffineC11 | + qG2AffineC12 | + qG2AffineC13 | + qG2AffineC14 | + qG2AffineC15 | + qG2AffineC16 } + +// batch size 80 when c = 10 type cG2AffineC10 [80]fptower.E4 type pG2AffineC10 [80]G2Affine type ppG2AffineC10 [80]*G2Affine -type qOpsG2AffineC10 [80]batchOpG2Affine +type qG2AffineC10 [80]batchOpG2Affine + +// batch size 150 when c = 11 type cG2AffineC11 [150]fptower.E4 type pG2AffineC11 [150]G2Affine type ppG2AffineC11 [150]*G2Affine -type qOpsG2AffineC11 [150]batchOpG2Affine +type qG2AffineC11 [150]batchOpG2Affine + +// batch size 200 when c = 12 type cG2AffineC12 
[200]fptower.E4 type pG2AffineC12 [200]G2Affine type ppG2AffineC12 [200]*G2Affine -type qOpsG2AffineC12 [200]batchOpG2Affine +type qG2AffineC12 [200]batchOpG2Affine + +// batch size 350 when c = 13 type cG2AffineC13 [350]fptower.E4 type pG2AffineC13 [350]G2Affine type ppG2AffineC13 [350]*G2Affine -type qOpsG2AffineC13 [350]batchOpG2Affine +type qG2AffineC13 [350]batchOpG2Affine + +// batch size 400 when c = 14 type cG2AffineC14 [400]fptower.E4 type pG2AffineC14 [400]G2Affine type ppG2AffineC14 [400]*G2Affine -type qOpsG2AffineC14 [400]batchOpG2Affine +type qG2AffineC14 [400]batchOpG2Affine + +// batch size 500 when c = 15 type cG2AffineC15 [500]fptower.E4 type pG2AffineC15 [500]G2Affine type ppG2AffineC15 [500]*G2Affine -type qOpsG2AffineC15 [500]batchOpG2Affine +type qG2AffineC15 [500]batchOpG2Affine + +// batch size 640 when c = 16 type cG2AffineC16 [640]fptower.E4 type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine -type qOpsG2AffineC16 [640]batchOpG2Affine +type qG2AffineC16 [640]batchOpG2Affine type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index 6989e58d4b..91d94501c6 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,16 +103,16 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -153,53 +153,54 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] + _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processChunk) case 6: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := 
processChunkG1Jacobian[bucketg1JacExtendedC7] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] + _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 9: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qOpsG1AffineC10, cG1AffineC10] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qOpsG1AffineC11, cG1AffineC11] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qOpsG1AffineC12, cG1AffineC12] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qOpsG1AffineC13, cG1AffineC13] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qOpsG1AffineC14, cG1AffineC14] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qOpsG1AffineC15, cG1AffineC15] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - 
_innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) + processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] + _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] - _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -208,8 +209,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -340,7 +341,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -351,16 +352,16 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -401,53 +402,54 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] + _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processChunk) case 6: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - processLastChunk := 
processChunkG2Jacobian[bucketg2JacExtendedC4] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] + _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 9: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qOpsG2AffineC10, cG2AffineC10] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qOpsG2AffineC11, cG2AffineC11] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qOpsG2AffineC12, cG2AffineC12] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qOpsG2AffineC13, cG2AffineC13] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] + processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qOpsG2AffineC14, cG2AffineC14] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 
15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qOpsG2AffineC15, cG2AffineC15] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] + _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16] - _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -456,8 +458,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -543,8 +545,8 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs*64)%c != 0 { + nbChunks := (fr.Bits + 1) / c + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -618,11 +620,6 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh } - // if digit is zero, no impact on result - if digit == 0 { - continue - } - // if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract // 2^{c} from the current digit, making it negative.
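The same bit-count substitution also lands in the window-size heuristic at the top of each MultiExp. In sketch form (mirroring the bestC closure visible in these diffs; implementedCs is the per-curve list of supported window sizes, and this standalone signature is illustrative):

// Approximate MSM cost at window size c: each of the roughly (fr.Bits+1)/c
// chunks scans all nbPoints once and then touches on the order of 2^c buckets.
// Assumes import "math".
func bestC(nbPoints int, frBits uint64, implementedCs []uint64) uint64 {
	min := math.MaxFloat64
	best := implementedCs[0]
	for _, c := range implementedCs {
		cc := (frBits + 1) * uint64(nbPoints+(1<<c))
		cost := float64(cc) / float64(c)
		if cost < min {
			min = cost
			best = c
		}
	}
	return best
}

The heuristic trades bucket count against pass count: a larger c means fewer chunks but exponentially more buckets, so the optimal window grows slowly with nbPoints.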
if digit >= max { @@ -631,17 +628,16 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } var bits uint16 - if digit >= 0 { + + // if digit is zero, no impact on result + if digit == 0 { + continue + } else if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits - // [s.index] |= (bits << s.shift) - // if s.multiWordSelect { - // toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - // } - } } diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index 8eeded8aa1..7413da377f 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -300,42 +300,56 @@ type ppG1Affine interface { // buckets: array of G1Affine queue operations (for the batch addition) type qOpsG1Affine interface { - qOpsG1AffineC10 | - qOpsG1AffineC11 | - qOpsG1AffineC12 | - qOpsG1AffineC13 | - qOpsG1AffineC14 | - qOpsG1AffineC15 | - qOpsG1AffineC16 + qG1AffineC10 | + qG1AffineC11 | + qG1AffineC12 | + qG1AffineC13 | + qG1AffineC14 | + qG1AffineC15 | + qG1AffineC16 } + +// batch size 80 when c = 10 type cG1AffineC10 [80]fp.Element type pG1AffineC10 [80]G1Affine type ppG1AffineC10 [80]*G1Affine -type qOpsG1AffineC10 [80]batchOpG1Affine +type qG1AffineC10 [80]batchOpG1Affine + +// batch size 150 when c = 11 type cG1AffineC11 [150]fp.Element type pG1AffineC11 [150]G1Affine type ppG1AffineC11 [150]*G1Affine -type qOpsG1AffineC11 [150]batchOpG1Affine +type qG1AffineC11 [150]batchOpG1Affine + +// batch size 200 when c = 12 type cG1AffineC12 [200]fp.Element type pG1AffineC12 [200]G1Affine type ppG1AffineC12 [200]*G1Affine -type qOpsG1AffineC12 [200]batchOpG1Affine +type qG1AffineC12 [200]batchOpG1Affine + +// batch size 350 when c = 13 type cG1AffineC13 [350]fp.Element type pG1AffineC13 [350]G1Affine type ppG1AffineC13 [350]*G1Affine -type qOpsG1AffineC13 [350]batchOpG1Affine +type qG1AffineC13 [350]batchOpG1Affine + +// batch size 400 when c = 14 type cG1AffineC14 [400]fp.Element type pG1AffineC14 [400]G1Affine type ppG1AffineC14 [400]*G1Affine -type qOpsG1AffineC14 [400]batchOpG1Affine +type qG1AffineC14 [400]batchOpG1Affine + +// batch size 500 when c = 15 type cG1AffineC15 [500]fp.Element type pG1AffineC15 [500]G1Affine type ppG1AffineC15 [500]*G1Affine -type qOpsG1AffineC15 [500]batchOpG1Affine +type qG1AffineC15 [500]batchOpG1Affine + +// batch size 640 when c = 16 type cG1AffineC16 [640]fp.Element type pG1AffineC16 [640]G1Affine type ppG1AffineC16 [640]*G1Affine -type qOpsG1AffineC16 [640]batchOpG1Affine +type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 @@ -616,42 +630,56 @@ type ppG2Affine interface { // buckets: array of G2Affine queue operations (for the batch addition) type qOpsG2Affine interface { - qOpsG2AffineC10 | - qOpsG2AffineC11 | - qOpsG2AffineC12 | - qOpsG2AffineC13 | - qOpsG2AffineC14 | - qOpsG2AffineC15 | - qOpsG2AffineC16 + qG2AffineC10 | + qG2AffineC11 | + qG2AffineC12 | + qG2AffineC13 | + qG2AffineC14 | + qG2AffineC15 | + qG2AffineC16 } + +// batch size 80 when c = 10 type cG2AffineC10 [80]fptower.E2 type pG2AffineC10 [80]G2Affine type ppG2AffineC10 [80]*G2Affine -type qOpsG2AffineC10 [80]batchOpG2Affine +type qG2AffineC10 [80]batchOpG2Affine + +// batch size 150 when c = 11 type cG2AffineC11 [150]fptower.E2 type pG2AffineC11 [150]G2Affine type ppG2AffineC11 [150]*G2Affine -type qOpsG2AffineC11 [150]batchOpG2Affine +type qG2AffineC11 [150]batchOpG2Affine + +// batch size 200 when c = 12 type cG2AffineC12 [200]fptower.E2 type 
pG2AffineC12 [200]G2Affine type ppG2AffineC12 [200]*G2Affine -type qOpsG2AffineC12 [200]batchOpG2Affine +type qG2AffineC12 [200]batchOpG2Affine + +// batch size 350 when c = 13 type cG2AffineC13 [350]fptower.E2 type pG2AffineC13 [350]G2Affine type ppG2AffineC13 [350]*G2Affine -type qOpsG2AffineC13 [350]batchOpG2Affine +type qG2AffineC13 [350]batchOpG2Affine + +// batch size 400 when c = 14 type cG2AffineC14 [400]fptower.E2 type pG2AffineC14 [400]G2Affine type ppG2AffineC14 [400]*G2Affine -type qOpsG2AffineC14 [400]batchOpG2Affine +type qG2AffineC14 [400]batchOpG2Affine + +// batch size 500 when c = 15 type cG2AffineC15 [500]fptower.E2 type pG2AffineC15 [500]G2Affine type ppG2AffineC15 [500]*G2Affine -type qOpsG2AffineC15 [500]batchOpG2Affine +type qG2AffineC15 [500]batchOpG2Affine + +// batch size 640 when c = 16 type cG2AffineC16 [640]fptower.E2 type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine -type qOpsG2AffineC16 [640]batchOpG2Affine +type qG2AffineC16 [640]batchOpG2Affine type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool diff --git a/ecc/bn254/multiexp_jacobian.go b/ecc/bn254/multiexp_jacobian.go index 288063d39a..9eaccec8eb 100644 --- a/ecc/bn254/multiexp_jacobian.go +++ b/ecc/bn254/multiexp_jacobian.go @@ -74,12 +74,12 @@ type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC3 | + bucketg1JacExtendedC3 | + bucketg1JacExtendedC2 | bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC6 | @@ -153,12 +153,12 @@ type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC3 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC2 | bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC6 | diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index d329dacf85..b71b5a45b3 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,16 +103,16 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks 
> 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -156,13 +156,16 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] - _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -171,8 +174,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -303,7 +306,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -314,16 +317,16 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := 
nbChunksPostSplit * 2 @@ -367,13 +370,16 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] + _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16] - _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -382,8 +388,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -469,8 +475,8 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs*64)%c != 0 { + nbChunks := (fr.Bits + 1) / c + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -544,11 +550,6 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh } - // if digit is zero, no impact on result - if digit == 0 { - continue - } - // if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract // 2^{c} from the current digit, making it negative.
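The qOpsG2AffineCxx → qG2AffineCxx renames (and their G1 counterparts) keep the same array lengths; the new // batch size comments just make the pairing with the window size explicit. Collected from those comments (an illustrative lookup, not an identifier from the patch):

// Window size c -> length of the cXxAffineCxx / pXxAffineCxx / ppXxAffineCxx /
// qXxAffineCxx arrays, i.e. how many batchOpG1Affine/batchOpG2Affine operations
// are queued before a batch is flushed.
var batchSizeForC = map[int]int{
	10: 80, 11: 150, 12: 200, 13: 350, 14: 400, 15: 500, 16: 640,
}

The sizes grow with c, presumably because larger windows keep more independent buckets in flight, so a bigger batch can be filled without conflicting updates to the same bucket.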
if digit >= max { @@ -557,17 +558,16 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } var bits uint16 - if digit >= 0 { + + // if digit is zero, no impact on result + if digit == 0 { + continue + } else if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits - // [s.index] |= (bits << s.shift) - // if s.multiWordSelect { - // toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - // } - } } diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index 7d3323a044..f25c59a8b6 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -269,12 +269,14 @@ type ppG1Affine interface { // buckets: array of G1Affine queue operations (for the batch addition) type qOpsG1Affine interface { - qOpsG1AffineC16 + qG1AffineC16 } + +// batch size 640 when c = 16 type cG1AffineC16 [640]fp.Element type pG1AffineC16 [640]G1Affine type ppG1AffineC16 [640]*G1Affine -type qOpsG1AffineC16 [640]batchOpG1Affine +type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 @@ -525,12 +527,14 @@ type ppG2Affine interface { // buckets: array of G2Affine queue operations (for the batch addition) type qOpsG2Affine interface { - qOpsG2AffineC16 + qG2AffineC16 } + +// batch size 640 when c = 16 type cG2AffineC16 [640]fp.Element type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine -type qOpsG2AffineC16 [640]batchOpG2Affine +type qG2AffineC16 [640]batchOpG2Affine type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool diff --git a/ecc/bw6-633/multiexp_jacobian.go b/ecc/bw6-633/multiexp_jacobian.go index e39d7fc165..d31a0eaf8c 100644 --- a/ecc/bw6-633/multiexp_jacobian.go +++ b/ecc/bw6-633/multiexp_jacobian.go @@ -65,9 +65,13 @@ type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended +type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended type ibg1JacExtended interface { - bucketg1JacExtendedC4 | + bucketg1JacExtendedC1 | + bucketg1JacExtendedC12 | + bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC8 | bucketg1JacExtendedC16 @@ -122,9 +126,13 @@ type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended type ibg2JacExtended interface { - bucketg2JacExtendedC4 | + bucketg2JacExtendedC1 | + bucketg2JacExtendedC12 | + bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC8 | bucketg2JacExtendedC16 diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index 719bca28bf..20bc87a829 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,16 +103,16 @@ func (p *G1Jac) MultiExp(points 
[]G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -153,17 +153,20 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] + _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] + _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] - _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -172,8 +175,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -304,7 +307,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -315,16 +318,16 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) 
// number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -365,17 +368,20 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] + _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] + _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16] - _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -384,8 +390,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -471,8 +477,8 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs*64)%c != 0 { + nbChunks := (fr.Bits + 1) / c + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -546,11 +552,6 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh } - // if digit is zero, no impact on result - if digit == 0 { - continue - } - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
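// illustrative example (not part of the patch): with c = 4, max = 2^{c-1} = 8,
// a window value of 11 is rewritten as 11 - 16 = -5 and the borrow bumps the
// next window by one; every digit then fits in [-2^{c-1}, 2^{c-1}], which is
// why the window count is taken over fr.Bits + 1 bits rather than fr.Limbs * 64.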
if digit >= max { @@ -559,17 +560,16 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } var bits uint16 - if digit >= 0 { + + // if digit is zero, no impact on result + if digit == 0 { + continue + } else if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits - // [s.index] |= (bits << s.shift) - // if s.multiWordSelect { - // toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - // } - } } diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index 739b3dca2e..419acf811c 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -269,12 +269,14 @@ type ppG1Affine interface { // buckets: array of G1Affine queue operations (for the batch addition) type qOpsG1Affine interface { - qOpsG1AffineC16 + qG1AffineC16 } + +// batch size 640 when c = 16 type cG1AffineC16 [640]fp.Element type pG1AffineC16 [640]G1Affine type ppG1AffineC16 [640]*G1Affine -type qOpsG1AffineC16 [640]batchOpG1Affine +type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 @@ -525,12 +527,14 @@ type ppG2Affine interface { // buckets: array of G2Affine queue operations (for the batch addition) type qOpsG2Affine interface { - qOpsG2AffineC16 + qG2AffineC16 } + +// batch size 640 when c = 16 type cG2AffineC16 [640]fp.Element type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine -type qOpsG2AffineC16 [640]batchOpG2Affine +type qG2AffineC16 [640]batchOpG2Affine type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool diff --git a/ecc/bw6-756/multiexp_jacobian.go b/ecc/bw6-756/multiexp_jacobian.go index 0cba708584..86ccb23bbc 100644 --- a/ecc/bw6-756/multiexp_jacobian.go +++ b/ecc/bw6-756/multiexp_jacobian.go @@ -65,9 +65,13 @@ type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended +type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended type ibg1JacExtended interface { - bucketg1JacExtendedC4 | + bucketg1JacExtendedC3 | + bucketg1JacExtendedC11 | + bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC8 | bucketg1JacExtendedC16 @@ -122,9 +126,13 @@ type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended +type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended type ibg2JacExtended interface { - bucketg2JacExtendedC4 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC11 | + bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC8 | bucketg2JacExtendedC16 diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index 1fce2c8080..e482857188 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,16 +103,16 @@ func (p *G1Jac) MultiExp(points 
[]G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -153,17 +153,20 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] + _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] + _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] - _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -172,8 +175,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -304,7 +307,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -315,16 +318,16 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a 
scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -365,17 +368,20 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] + _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] + _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16] - _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -384,8 +390,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -471,8 +477,8 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs*64)%c != 0 { + nbChunks := (fr.Bits + 1) / c + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -546,11 +552,6 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh } - // if digit is zero, no impact on result - if digit == 0 { - continue - } - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract 
// 2^{c} to the current digit, making it negative. if digit >= max { @@ -559,17 +560,16 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } var bits uint16 - if digit >= 0 { + + // if digit is zero, no impact on result + if digit == 0 { + continue + } else if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits - // [s.index] |= (bits << s.shift) - // if s.multiWordSelect { - // toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - // } - } } diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index d00ef26272..4f039c26bc 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -269,12 +269,14 @@ type ppG1Affine interface { // buckets: array of G1Affine queue operations (for the batch addition) type qOpsG1Affine interface { - qOpsG1AffineC16 + qG1AffineC16 } + +// batch size 640 when c = 16 type cG1AffineC16 [640]fp.Element type pG1AffineC16 [640]G1Affine type ppG1AffineC16 [640]*G1Affine -type qOpsG1AffineC16 [640]batchOpG1Affine +type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 @@ -525,12 +527,14 @@ type ppG2Affine interface { // buckets: array of G2Affine queue operations (for the batch addition) type qOpsG2Affine interface { - qOpsG2AffineC16 + qG2AffineC16 } + +// batch size 640 when c = 16 type cG2AffineC16 [640]fp.Element type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine -type qOpsG2AffineC16 [640]batchOpG2Affine +type qG2AffineC16 [640]batchOpG2Affine type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool diff --git a/ecc/bw6-761/multiexp_jacobian.go b/ecc/bw6-761/multiexp_jacobian.go index af2d68b853..3039c09d6c 100644 --- a/ecc/bw6-761/multiexp_jacobian.go +++ b/ecc/bw6-761/multiexp_jacobian.go @@ -65,9 +65,15 @@ type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended +type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended type ibg1JacExtended interface { - bucketg1JacExtendedC4 | + bucketg1JacExtendedC2 | + bucketg1JacExtendedC3 | + bucketg1JacExtendedC10 | + bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC8 | bucketg1JacExtendedC16 @@ -122,9 +128,15 @@ type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended +type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended type ibg2JacExtended interface { - bucketg2JacExtendedC4 | + bucketg2JacExtendedC2 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC10 | + bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC8 | bucketg2JacExtendedC16 diff --git a/internal/generator/ecc/generate.go b/internal/generator/ecc/generate.go index 6eb2c9f975..f77b9d5ca8 100644 --- a/internal/generator/ecc/generate.go +++ b/internal/generator/ecc/generate.go @@ -28,9 +28,7 @@ func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) er return x == reflect.ValueOf(a).Len()-1 } lastC := func(c int) int { - // lastC := (fr.Limbs * 64) - (c * 
(fr.Limbs * 64 / c)) - // if c divides fr.Limbs * 64; - n := (conf.Fr.NbWords * 64) + n := (conf.Fr.NbBits + 1) // +1 for the potential carry of the NAF decomposition if n%c == 0 { return c } diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index c5b4eb675b..1e3e454dbe 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -42,8 +42,8 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs * 64)%c != 0 { + nbChunks := (fr.Bits+1) / c + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -118,11 +118,6 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks digit += int(scalar[s.index+1] & s.maskHigh) << s.shiftHigh } - // if digit is zero, no impact on result - if digit == 0 { - continue - } - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. @@ -132,17 +127,16 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } var bits uint16 - if digit >= 0 { + + // if digit is zero, no impact on result + if digit == 0 { + continue + } else if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits - // [s.index] |= (bits << s.shift) - // if s.multiWordSelect { - // toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - // } - } } @@ -361,7 +355,7 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits+1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -372,16 +366,16 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits+1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints/2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits+1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit*2 @@ -429,7 +423,7 @@ func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffi {{- if le $c 9}} processChunk := processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}] {{- else}} - processChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{$c}}, bitSetC{{$c}}, p{{$.TAffine}}C{{$c}}, pp{{$.TAffine}}C{{$c}}, qOps{{$.TAffine}}C{{$c}}, c{{$.TAffine}}C{{$c}}] + processChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{$c}}, bitSetC{{$c}}, p{{$.TAffine}}C{{$c}}, pp{{$.TAffine}}C{{$c}}, 
q{{$.TAffine}}C{{$c}}, c{{$.TAffine}}C{{$c}}] {{- end}} {{- if eq $c $lc}} _innerMsm{{ $.UPointName }}(p, {{$c}}, points, digits, splitFirstChunk, processChunk, processChunk) @@ -437,7 +431,7 @@ func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffi {{- if le $lc 9}} processLastChunk := processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{lastC $c}}] {{- else}} - processLastChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{lastC $c}}, bitSetC{{$c}}, p{{$.TAffine}}C{{$c}}, pp{{$.TAffine}}C{{$c}}, qOps{{$.TAffine}}C{{$c}}, c{{$.TAffine}}C{{$c}}] + processLastChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{lastC $c}}, bitSetC{{$c}}, p{{$.TAffine}}C{{$c}}, pp{{$.TAffine}}C{{$c}}, q{{$.TAffine}}C{{$c}}, c{{$.TAffine}}C{{$c}}] {{- end}} _innerMsm{{ $.UPointName }}(p, {{$c}}, points, digits, splitFirstChunk, processChunk, processLastChunk) {{- end}} @@ -450,8 +444,8 @@ func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffi func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, digits []uint16)) *{{ $.TJacobian }} { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits+1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index ccfba7f2be..312ea0897f 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -303,19 +303,19 @@ type pp{{ $.TAffine }} interface { type qOps{{ $.TAffine }} interface { {{- range $i, $c := $.CRange}} {{- if gt $c 9}} - qOps{{ $.TAffine }}C{{$c}} {{- if not (last $i $.CRange)}} | {{- end}} + q{{ $.TAffine }}C{{$c}} {{- if not (last $i $.CRange)}} | {{- end}} {{- end}} {{- end}} } {{- range $c := $.CRange}} -{{- if gt $c 9}} +{{if gt $c 9}} +// batch size {{batchSize $c}} when c = {{$c}} type c{{ $.TAffine }}C{{$c}} [{{batchSize $c}}]{{ $.CoordType }} type p{{ $.TAffine }}C{{$c}} [{{batchSize $c}}]{{ $.TAffine }} type pp{{ $.TAffine }}C{{$c}} [{{batchSize $c}}]*{{ $.TAffine }} -type qOps{{ $.TAffine }}C{{$c}} [{{batchSize $c}}]batchOp{{ $.TAffine }} - +type q{{ $.TAffine }}C{{$c}} [{{batchSize $c}}]batchOp{{ $.TAffine }} {{- end}} {{- end}} From 533743e184b86c2f62ede7e743149107f5e28d71 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 15 Nov 2022 09:30:57 -0600 Subject: [PATCH 22/43] style: cosmetics --- ecc/bls12-377/multiexp.go | 60 ++++++++----------- ecc/bls12-378/multiexp.go | 56 ++++++++--------- ecc/bls12-381/multiexp.go | 52 +++++++--------- ecc/bls24-315/multiexp.go | 60 ++++++++----------- ecc/bls24-317/multiexp.go | 52 +++++++--------- ecc/bn254/multiexp.go | 56 ++++++++--------- ecc/bw6-633/multiexp.go | 56 ++++++++--------- ecc/bw6-756/multiexp.go | 56 ++++++++--------- ecc/bw6-761/multiexp.go | 56 ++++++++--------- .../generator/ecc/template/multiexp.go.tmpl | 36 +++++------ 10 files changed, 234 insertions(+), 306 deletions(-) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index 4ec5027cfb..f6ea24d92c 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) 
MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,18 +103,13 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -197,11 +192,11 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") @@ -211,10 +206,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -343,7 +335,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -354,18 +346,13 @@ func (p *G2Jac) 
MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -448,11 +435,11 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") @@ -462,10 +449,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -539,6 +523,17 @@ type selector struct { shiftHigh uint64 // same than shift, for index+1 } +// return number of chunks for a given window size c +func computeNbChunks(c uint64) uint64 { + // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF + // decomposition in partitionScalars + nbChunks := (fr.Bits + 1) / c + if (fr.Bits+1)%c != 0 { + nbChunks++ + } + return nbChunks +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
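The constant 253 below is the bit length of the BLS12-377 scalar field modulus; with it, computeNbChunks(16) returns 16 windows and leaves only 14 bits for the top one, which is why the c = 16 cases above pair a C16 processChunk with a C14 processLastChunk. A minimal standalone sketch of that arithmetic (illustrative only, not part of the patch):

package main

import "fmt"

const frBits uint64 = 253 // bit length of the BLS12-377 scalar field modulus

// mirrors computeNbChunks: ceil((fr.Bits+1) / c), the +1 absorbing the
// final carry of the signed-digit decomposition in partitionScalars.
func computeNbChunks(c uint64) uint64 {
	nbChunks := (frBits + 1) / c
	if (frBits+1)%c != 0 {
		nbChunks++
	}
	return nbChunks
}

func main() {
	c := uint64(16)
	nbChunks := computeNbChunks(c)         // 16 windows
	lastC := (frBits + 1) - c*(nbChunks-1) // bits left for the top window
	fmt.Println(nbChunks, lastC)           // 16 14 -> bucketG1AffineC14 above
}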
@@ -549,10 +544,7 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) toReturn := make([]uint16, len(scalars)*int(nbChunks)) diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index bf0a181fde..9f16ef2b91 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,18 +103,13 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -199,7 +194,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") @@ -209,10 +204,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -341,7 +333,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + 
cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -352,18 +344,13 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -448,7 +435,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") @@ -458,10 +445,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -535,6 +519,17 @@ type selector struct { shiftHigh uint64 // same than shift, for index+1 } +// return number of chunks for a given window size c +func computeNbChunks(c uint64) uint64 { + // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF + // decomposition in partitionScalars + nbChunks := (fr.Bits + 1) / c + if (fr.Bits+1)%c != 0 { + nbChunks++ + } + return nbChunks +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
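The doc comment above states the borrow rule abstractly; a toy recoding of an 8-bit scalar (illustrative helper, not the library's code) shows the rule in action and why the final carry can spill into one extra window:

package main

import "fmt"

// recode splits scalar into nbChunks c-bit windows and applies the borrow
// rule: any digit >= 2^{c-1} becomes digit - 2^c, with a carry of 1 pushed
// into the next window.
func recode(scalar uint64, c, nbChunks uint64) []int64 {
	max := int64(1) << (c - 1)
	mask := uint64(1)<<c - 1
	digits := make([]int64, nbChunks)
	carry := int64(0)
	for i := uint64(0); i < nbChunks; i++ {
		d := carry + int64((scalar>>(i*c))&mask)
		carry = 0
		if d >= max {
			d -= int64(1) << c // borrow 2^c from the next window
			carry = 1
		}
		digits[i] = d
	}
	return digits
}

func main() {
	// 219 = 0xDB: raw 4-bit windows are [11, 13]; both exceed 2^{c-1} = 8,
	// so each is recoded and the borrow ripples into a third window.
	fmt.Println(recode(219, 4, 3)) // [-5 -2 1], and -5 - 2*16 + 1*256 = 219
}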
@@ -545,10 +540,7 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) toReturn := make([]uint16, len(scalars)*int(nbChunks)) diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index 4e7c44d879..a9d35fd9aa 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,18 +103,13 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -208,10 +203,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -340,7 +332,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -351,18 +343,13 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) 
- if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -456,10 +443,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -533,6 +517,17 @@ type selector struct { shiftHigh uint64 // same than shift, for index+1 } +// return number of chunks for a given window size c +func computeNbChunks(c uint64) uint64 { + // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF + // decomposition in partitionScalars + nbChunks := (fr.Bits + 1) / c + if (fr.Bits+1)%c != 0 { + nbChunks++ + } + return nbChunks +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. @@ -543,10 +538,7 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) toReturn := make([]uint16, len(scalars)*int(nbChunks)) diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 828296544e..970cd52000 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,18 +103,13 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -197,11 +192,11 @@ func innerMsmG1(p 
*G1Jac, c int, points []G1Affine, scalars []fr.Element, config _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") @@ -211,10 +206,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -343,7 +335,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -354,18 +346,13 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -448,11 +435,11 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] 
+ processLastChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") @@ -462,10 +449,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -539,6 +523,17 @@ type selector struct { shiftHigh uint64 // same than shift, for index+1 } +// return number of chunks for a given window size c +func computeNbChunks(c uint64) uint64 { + // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF + // decomposition in partitionScalars + nbChunks := (fr.Bits + 1) / c + if (fr.Bits+1)%c != 0 { + nbChunks++ + } + return nbChunks +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
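partitionScalars then packs each signed digit into a uint16. The helper names below are hypothetical, but the bit layout mirrors the patch: the low bit carries the sign, the remaining bits the magnitude, and 0 is reserved for "digit is zero, skip this point for this window":

package main

import "fmt"

func pack(digit int) uint16 {
	if digit == 0 {
		return 0 // no bucket touched
	}
	if digit > 0 {
		return uint16(digit) << 1 // even: add to bucket
	}
	return (uint16(-digit-1) << 1) + 1 // odd: subtract from bucket
}

func unpack(bits uint16) int {
	if bits == 0 {
		return 0
	}
	if bits&1 == 0 {
		return int(bits >> 1)
	}
	return -int(bits>>1) - 1
}

func main() {
	for _, d := range []int{-8, -1, 0, 1, 7} {
		fmt.Println(d, pack(d), unpack(pack(d))) // round-trips for every digit
	}
}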
@@ -549,10 +544,7 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) toReturn := make([]uint16, len(scalars)*int(nbChunks)) diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index c61cd372ec..52f2fe363f 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,18 +103,13 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -208,10 +203,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -340,7 +332,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -351,18 +343,13 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) 
- if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -456,10 +443,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -533,6 +517,17 @@ type selector struct { shiftHigh uint64 // same than shift, for index+1 } +// return number of chunks for a given window size c +func computeNbChunks(c uint64) uint64 { + // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF + // decomposition in partitionScalars + nbChunks := (fr.Bits + 1) / c + if (fr.Bits+1)%c != 0 { + nbChunks++ + } + return nbChunks +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. @@ -543,10 +538,7 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) toReturn := make([]uint16, len(scalars)*int(nbChunks)) diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index 91d94501c6..dd3d4c2183 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,18 +103,13 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -199,7 +194,7 @@ func innerMsmG1(p *G1Jac, c int, 
points []G1Affine, scalars []fr.Element, config _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") @@ -209,10 +204,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -341,7 +333,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -352,18 +344,13 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -448,7 +435,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") @@ -458,10 +445,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- 
g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -535,6 +519,17 @@ type selector struct { shiftHigh uint64 // same than shift, for index+1 } +// return number of chunks for a given window size c +func computeNbChunks(c uint64) uint64 { + // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF + // decomposition in partitionScalars + nbChunks := (fr.Bits + 1) / c + if (fr.Bits+1)%c != 0 { + nbChunks++ + } + return nbChunks +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. @@ -545,10 +540,7 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) toReturn := make([]uint16, len(scalars)*int(nbChunks)) diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index b71b5a45b3..b16ddb3d18 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,18 +103,13 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -164,7 +159,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] _innerMsmG1(p, 16, points, digits, 
splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") @@ -174,10 +169,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -306,7 +298,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -317,18 +309,13 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -378,7 +365,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") @@ -388,10 +375,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -465,6 +449,17 @@ type selector struct { shiftHigh uint64 // same than shift, for index+1 } +// return number of chunks for a given window size c +func computeNbChunks(c uint64) uint64 { + // note that we use fr.Bits + 1 --> +1 for a 
potential carry propagation due to the NAF + // decomposition in partitionScalars + nbChunks := (fr.Bits + 1) / c + if (fr.Bits+1)%c != 0 { + nbChunks++ + } + return nbChunks +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. @@ -475,10 +470,7 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) toReturn := make([]uint16, len(scalars)*int(nbChunks)) diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index 20bc87a829..be86f40a8f 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,18 +103,13 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -165,7 +160,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") @@ -175,10 +170,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for 
each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -307,7 +299,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -318,18 +310,13 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -380,7 +367,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") @@ -390,10 +377,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -467,6 +451,17 @@ type selector struct { shiftHigh uint64 // same than shift, for index+1 } +// return number of chunks for a given window size c +func computeNbChunks(c uint64) uint64 { + // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF + // decomposition in partitionScalars + nbChunks := (fr.Bits + 1) / c + if (fr.Bits+1)%c != 0 { + nbChunks++ + } + return nbChunks +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
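The recoding rule documented above can be checked in isolation. Below is a minimal, self-contained sketch of the same borrow-and-carry idea on a toy uint64 scalar; signedDigits and the c = 4 window are illustrative stand-ins, not code from this patch, and the real partitionScalars packs signs into uint16 digits rather than returning int64s.

package main

import "fmt"

// signedDigits splits a scalar into c-bit windows; a digit larger than
// 2^(c-1) borrows 2^c from the next window and becomes negative. The
// final carry can spill into one extra window, which is why the chunk
// count is computed over fr.Bits+1 bits.
func signedDigits(scalar uint64, c uint) []int64 {
	mask := uint64(1)<<c - 1
	var digits []int64
	carry := int64(0)
	for scalar != 0 || carry != 0 {
		d := int64(scalar&mask) + carry
		scalar >>= c
		carry = 0
		if d > int64(1)<<(c-1) {
			d -= int64(1) << c // borrow 2^c from the next window...
			carry = 1          // ...and propagate the carry up
		}
		digits = append(digits, d)
	}
	return digits
}

func main() {
	const c = 4
	s := uint64(0xDEADBEEF)
	ds := signedDigits(s, c)
	// recompose sum(ds[i] * 2^(c*i)) to check the decomposition is exact
	acc := int64(0)
	for i := len(ds) - 1; i >= 0; i-- {
		acc = acc<<c + ds[i]
	}
	fmt.Println(ds, uint64(acc) == s) // prints the digits, then true
}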
@@ -477,10 +472,7 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) toReturn := make([]uint16, len(scalars)*int(nbChunks)) diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index e482857188..7272a5efef 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,18 +103,13 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -165,7 +160,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") @@ -175,10 +170,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -307,7 +299,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := 
(fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -318,18 +310,13 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -380,7 +367,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") @@ -390,10 +377,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -467,6 +451,17 @@ type selector struct { shiftHigh uint64 // same than shift, for index+1 } +// return number of chunks for a given window size c +func computeNbChunks(c uint64) uint64 { + // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF + // decomposition in partitionScalars + nbChunks := (fr.Bits + 1) / c + if (fr.Bits+1)%c != 0 { + nbChunks++ + } + return nbChunks +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
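As a sanity reference, computeNbChunks above is a ceiling division in disguise: nbChunks = ceil((fr.Bits+1)/c), where the +1 absorbs the carry that the signed-digit decomposition may push past the top window. A standalone check of that equivalence (frBits here is an assumed stand-in constant, not taken from this codebase):

package main

import "fmt"

const frBits = 377 // stand-in scalar-field bit size, for illustration only

func computeNbChunks(c uint64) uint64 {
	nbChunks := (frBits + 1) / c
	if (frBits+1)%c != 0 {
		nbChunks++
	}
	return nbChunks
}

func main() {
	for c := uint64(4); c <= 16; c++ {
		nb := computeNbChunks(c)
		// closed form of the same ceiling division, and the covering
		// invariant the chunking relies on: (nb-1)*c < frBits+1 <= nb*c
		fmt.Println(c, nb, nb == (frBits+c)/c, (nb-1)*c < frBits+1 && frBits+1 <= nb*c)
	}
}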
@@ -477,10 +472,7 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) toReturn := make([]uint16, len(scalars)*int(nbChunks)) diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index 1e3e454dbe..7066a005af 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -32,6 +32,17 @@ type selector struct { shiftHigh uint64 // same than shift, for index+1 } +// return number of chunks for a given window size c +func computeNbChunks(c uint64) uint64 { + // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF + // decomposition in partitionScalars + nbChunks := (fr.Bits+1) / c + if (fr.Bits+1)%c != 0 { + nbChunks++ + } + return nbChunks +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. @@ -42,10 +53,7 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := (fr.Bits+1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) toReturn := make([]uint16, len(scalars)*int(nbChunks)) @@ -355,7 +363,7 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits+1) * (nbPoints + (1 << (c))) + cc := (fr.Bits+1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -366,18 +374,13 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem } C := bestC(nbPoints) - nbChunks := int((fr.Bits+1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints/2) - nbChunksPostSplit := int((fr.Bits+1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit*2 if (nbTasksPostSplit <= config.NbTasks /2 ) || ( nbTasksPostSplit - config.NbTasks/2 ) <= ( config.NbTasks - nbChunks) { // if postSplit we still have less tasks than available CPU @@ -431,7 +434,7 @@ func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffi {{- if le $lc 9}} processLastChunk := processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{lastC $c}}] {{- else}} - processLastChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{lastC $c}}, bitSetC{{$c}}, p{{$.TAffine}}C{{$c}}, pp{{$.TAffine}}C{{$c}}, q{{$.TAffine}}C{{$c}}, 
c{{$.TAffine}}C{{$c}}] + processLastChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{lastC $c}}, bitSetC{{lastC $c}}, p{{$.TAffine}}C{{lastC $c}}, pp{{$.TAffine}}C{{lastC $c}}, q{{$.TAffine}}C{{lastC $c}}, c{{$.TAffine}}C{{lastC $c}}] {{- end}} _innerMsm{{ $.UPointName }}(p, {{$c}}, points, digits, splitFirstChunk, processChunk, processLastChunk) {{- end}} @@ -444,10 +447,7 @@ func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffi func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, digits []uint16)) *{{ $.TJacobian }} { - nbChunks := ((fr.Bits+1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window From 59eb243242b8b7fb4cb2397e9466ac2d9c8df08b Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 15 Nov 2022 10:37:08 -0600 Subject: [PATCH 23/43] style: code cleaning --- ecc/bls12-377/multiexp.go | 209 +++++++++--------- ecc/bls12-377/multiexp_affine.go | 154 +++---------- ecc/bls12-377/multiexp_test.go | 31 +-- ecc/bls12-378/multiexp.go | 205 +++++++++-------- ecc/bls12-378/multiexp_affine.go | 154 +++---------- ecc/bls12-378/multiexp_test.go | 31 +-- ecc/bls12-381/multiexp.go | 203 +++++++++-------- ecc/bls12-381/multiexp_affine.go | 154 +++---------- ecc/bls12-381/multiexp_test.go | 31 +-- ecc/bls24-315/multiexp.go | 209 +++++++++--------- ecc/bls24-315/multiexp_affine.go | 154 +++---------- ecc/bls24-315/multiexp_test.go | 31 +-- ecc/bls24-317/multiexp.go | 203 +++++++++-------- ecc/bls24-317/multiexp_affine.go | 154 +++---------- ecc/bls24-317/multiexp_test.go | 31 +-- ecc/bn254/multiexp.go | 205 +++++++++-------- ecc/bn254/multiexp_affine.go | 154 +++---------- ecc/bn254/multiexp_test.go | 31 +-- ecc/bw6-633/multiexp.go | 99 +++++---- ecc/bw6-633/multiexp_affine.go | 154 +++---------- ecc/bw6-633/multiexp_test.go | 31 +-- ecc/bw6-756/multiexp.go | 101 +++++---- ecc/bw6-756/multiexp_affine.go | 154 +++---------- ecc/bw6-756/multiexp_test.go | 31 +-- ecc/bw6-761/multiexp.go | 101 +++++---- ecc/bw6-761/multiexp_affine.go | 154 +++---------- ecc/bw6-761/multiexp_test.go | 31 +-- .../generator/ecc/template/multiexp.go.tmpl | 67 +++--- .../ecc/template/multiexp_affine.go.tmpl | 78 ++----- .../ecc/template/tests/multiexp.go.tmpl | 16 +- 30 files changed, 1254 insertions(+), 2107 deletions(-) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index f6ea24d92c..cf970c5246 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -128,83 +128,79 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG1(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // 
if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG1(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] - _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] - _innerMsmG1(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] - _innerMsmG1(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] - _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC10] + } + return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC11] + } + return processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] - _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return 
processChunkG1Jacobian[bucketg1JacExtendedC12] + } + return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] - _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC13] + } + return processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] - _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC14] + } + return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] - _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC15] + } + return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] - _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } + return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: - panic("not implemented") + // panic("will not happen: values of c other than the above are not generated by the templates") + return processChunkG1Jacobian[bucketg1JacExtendedC16] } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { nbChunks := computeNbChunks(c) @@ -221,9 +217,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller.
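	// (the "different method" is selected via getChunkProcessorG1(lastC(c)) just
	// below: the top window of the decomposition spans at most lastC(c) <= c bits,
	// so its buckets can be sized for lastC(c) rather than c)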
n := len(points) + processLastChunk := getChunkProcessorG1(lastC(c)) go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { + processChunk := getChunkProcessorG1(c) go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } @@ -231,6 +229,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + processChunk := getChunkProcessorG1(c) if !splitFirstChunk { go processChunk(0, chChunks[0], c, points, digits[:n]) } else { @@ -371,83 +370,79 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG2(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG2(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] - _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] - _innerMsmG2(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] - _innerMsmG2(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - 
processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] - _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC10] + } + return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC11] + } + return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] - _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC12] + } + return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] - _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC13] + } + return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] - _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC14] + } + return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] - _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC15] + } + return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] - _innerMsmG2(p, 16, points, digits, 
splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC16] + } + return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: - panic("not implemented") + // panic("will not happen: values of c other than the above are not generated by the templates") + return processChunkG2Jacobian[bucketg2JacExtendedC16] } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac { nbChunks := computeNbChunks(c) @@ -464,9 +459,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) + processLastChunk := getChunkProcessorG2(lastC(c)) go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { + processChunk := getChunkProcessorG2(c) go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } @@ -474,6 +471,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + processChunk := getChunkProcessorG2(c) if !splitFirstChunk { go processChunk(0, chChunks[0], c, points, digits[:n]) } else { @@ -534,6 +532,17 @@ func computeNbChunks(c uint64) uint64 { return nbChunks } +// return the last window size for a scalar; if c divides the scalar size +// then it returns c +// if not, returns lastC with lastC < c +func lastC(c uint64) uint64 { + const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition + if n%c == 0 { + return c + } + return n - (c * (n / c)) +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index 3e16fddca6..8f33a80438 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -50,64 +50,30 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af } // setup for the batch affine; - // we do that instead of a separate object to give enough hints to the compiler to..
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict with the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG1Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G1Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -149,15 +115,12 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -166,19 +129,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -192,7 +142,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -214,7 +164,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } @@ -380,64 +329,30 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } // setup for the batch affine; - // we do that instead of a separate object to give enough hints to the compiler to..
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict with the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG2Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G2Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -479,15 +394,12 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -496,19 +408,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -522,7 +421,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -544,7 +443,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 4b4406e922..347fb0ab6b 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -21,7 +21,6 @@ import ( "math/big" "math/bits" "math/rand" - "runtime" "sync" "testing" "time" @@ -91,8 +90,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], 
ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -127,8 +125,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -163,8 +162,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -231,7 +231,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -373,8 +373,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -407,8 +406,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -443,8 +443,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -511,7 +512,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 9f16ef2b91..91f7ede410 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -128,81 +128,79 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG1(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG1(p *G1Jac, c int, 
points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG1(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] - _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC10] + } + return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] - _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC11] + } + return 
processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC12] + } + return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC13] + } + return processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC14] + } + return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] - _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC15] + } + return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] - _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } + return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: - panic("not implemented") + // panic("will not happen c != previous values is not generated by templates") + return processChunkG1Jacobian[bucketg1JacExtendedC16] } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { nbChunks := computeNbChunks(c) @@ -219,9 +217,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. 
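A note on why the dispatch above is a switch at all: Go generics have no constant (integer) parameters, so the bucket array length 1<<(c-1) cannot be a runtime value; each window width gets its own named array type and its own instantiation of the chunk processor, and getChunkProcessorG1 maps the runtime c onto the matching instantiation. A minimal self-contained sketch of that pattern follows; the toy types and names are hypothetical, only the pattern mirrors the code above.

package main

import "fmt"

type bucketC4 [1 << 3]int // 2^(c-1) buckets for c = 4
type bucketC5 [1 << 4]int // 2^(c-1) buckets for c = 5

type ibBucket interface {
	bucketC4 | bucketC5
}

// processChunk is generic over the bucket array; the array size is fixed
// at instantiation time, which is exactly why callers need the switch.
func processChunk[B ibBucket](digits []uint16) int {
	var buckets B // lives on the stack, size known at compile time
	for _, d := range digits {
		buckets[int(d)%len(buckets)]++
	}
	total := 0
	for i := 0; i < len(buckets); i++ {
		total += buckets[i]
	}
	return total
}

func getChunkProcessor(c uint64) func([]uint16) int {
	switch c {
	case 4:
		return processChunk[bucketC4]
	case 5:
		return processChunk[bucketC5]
	default:
		return processChunk[bucketC5] // mirror the template's fallback case
	}
}

func main() {
	fmt.Println(getChunkProcessor(4)([]uint16{1, 2, 2, 7}))
}
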
n := len(points) + processLastChunk := getChunkProcessorG1(lastC(c)) go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { + processChunk := getChunkProcessorG1(c) go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } @@ -229,6 +229,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + processChunk := getChunkProcessorG1(c) if !splitFirstChunk { go processChunk(0, chChunks[0], c, points, digits[:n]) } else { @@ -369,81 +370,79 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG2(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG2(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] - _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 
9, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC10] + } + return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] - _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC11] + } + return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC12] + } + return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC13] + } + return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC14] + } + return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] - _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC15] + } + return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] - _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC16] + } + return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, 
cG2AffineC16]
	default:
-		panic("not implemented")
+		// panic("will not happen: a c outside the values above is not generated by the templates")
+		return processChunkG2Jacobian[bucketg2JacExtendedC16]
	}
}

-func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool,
-	processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac {
+func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac {

	nbChunks := computeNbChunks(c)

@@ -460,9 +459,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi
	// the last chunk may be processed with a different method than the rest, as it could be smaller.
	n := len(points)
+	processLastChunk := getChunkProcessorG2(lastC(c))
	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:])

	for j := int(nbChunks - 2); j > 0; j-- {
+		processChunk := getChunkProcessorG2(c)
		go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n])
	}

@@ -470,6 +471,7 @@
	// --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed
	// in the ~same amount of time
	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
+		processChunk := getChunkProcessorG2(c)
		if !splitFirstChunk {
			go processChunk(0, chChunks[0], c, points, digits[:n])
		} else {
@@ -530,6 +532,17 @@ func computeNbChunks(c uint64) uint64 {
	return nbChunks
}

+// lastC returns the size of the last window to process; if c divides the
+// scalar size (fr.Bits + 1), it returns c; otherwise it returns the
+// remainder (fr.Bits + 1) mod c, which is always smaller than c
+func lastC(c uint64) uint64 {
+	const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition
+	if n%c == 0 {
+		return c
+	}
+	return n - (c * (n / c))
+}
+
 // partitionScalars computes, for each scalar over c-bit wide windows, nbChunk digits
 // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and subtract
 // 2^{c} from the current digit, making it negative.
diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go
index ed8968000e..09ee239d09 100644
--- a/ecc/bls12-378/multiexp_affine.go
+++ b/ecc/bls12-378/multiexp_affine.go
@@ -50,64 +50,30 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af
	}

	// setup for the batch affine;
-	// we do that instead of a separate object to give enough hints to the compiler to..
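To make lastC concrete: with n = fr.Bits + 1 bits to decompose, every window has width c except possibly the last one, which gets the n mod c leftover bits. A standalone sketch with a hypothetical 253-bit scalar field; computeNbChunks is reconstructed from context here and is an assumption, not a copy of the library code.

package main

import "fmt"

const n uint64 = 253 + 1 // toy fr.Bits + 1 (carry bit of the signed decomposition)

// assumed reconstruction: number of c-bit windows needed to cover n bits
func computeNbChunks(c uint64) uint64 {
	nbChunks := n / c
	if n%c != 0 {
		nbChunks++
	}
	return nbChunks
}

func lastC(c uint64) uint64 {
	if n%c == 0 {
		return c
	}
	return n - c*(n/c) // n mod c, always < c
}

func main() {
	for _, c := range []uint64{4, 5, 8, 16} {
		fmt.Printf("c=%2d nbChunks=%2d lastC=%2d\n", c, computeNbChunks(c), lastC(c))
	}
}
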
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG1Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G1Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -149,15 +115,12 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -166,19 +129,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -192,7 +142,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -214,7 +164,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } @@ -380,64 +329,30 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } // setup for the batch affine; - // we do that instead of a separate object to give enough hints to the compiler to.. 
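The consolidation above (addFromQueue folded into add, processTopQueue dropped) leaves the core invariant intact: batch inversion shares a single field inverse across the whole batch, so each bucket may be touched at most once per batch, and an operation that hits a busy bucket is parked in the queue until after a flush. A toy model of that scheduling, with plain integers instead of curve points and a simplified replay policy (the real code drains the queue via processQueue at different points in the digit loop):

package main

import "fmt"

const batchSize = 4

type op struct{ bucketID uint16 }

func main() {
	var used [8]bool // stand-in for the bitSet bucketIds
	batch := make([]op, 0, batchSize)
	var queue []op

	flush := func() {
		// the real code performs one batched affine addition here,
		// amortizing a single field inversion over the whole batch
		fmt.Println("flush", batch)
		batch = batch[:0]
		used = [8]bool{}
	}

	var schedule func(op)
	schedule = func(o op) {
		if used[o.bucketID] {
			queue = append(queue, o) // conflict: wait for a later batch
			return
		}
		used[o.bucketID] = true
		batch = append(batch, o)
		if len(batch) == batchSize {
			flush()
			replay := queue
			queue = nil
			for _, q := range replay {
				schedule(q)
			}
		}
	}

	for _, b := range []uint16{1, 2, 1, 3, 4, 1, 5} {
		schedule(op{bucketID: b})
	}
	if len(batch) > 0 {
		flush()
	}
	fmt.Println("still queued:", queue)
}
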
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG2Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G2Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -479,15 +394,12 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -496,19 +408,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -522,7 +421,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -544,7 +443,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index b710acf39b..90b1321a9a 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -21,7 +21,6 @@ import ( "math/big" "math/bits" "math/rand" - "runtime" "sync" "testing" "time" @@ -91,8 +90,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], 
ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -127,8 +125,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -163,8 +162,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -231,7 +231,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -373,8 +373,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -407,8 +406,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -443,8 +443,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -511,7 +512,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index a9d35fd9aa..191875391e 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -128,80 +128,79 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG1(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG1(p *G1Jac, c int, 
points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG1(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC10] + } + return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC11] + } + return processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, 
ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC12] + } + return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC13] + } + return processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC14] + } + return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC15] + } + return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } + return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: - panic("not implemented") + // panic("will not happen c != previous values is not generated by templates") + return processChunkG1Jacobian[bucketg1JacExtendedC16] } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { nbChunks := computeNbChunks(c) @@ -218,9 +217,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. 
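The goroutine wiring in _innerMsmG1/_innerMsmG2 is a plain scatter/gather: one goroutine and one result channel per chunk, then a fold from the most significant chunk down, doubling c times between chunks. A toy sketch with integers standing in for the g1JacExtended chunk accumulators:

package main

import "fmt"

func main() {
	const c = 4
	digitsPerChunk := [][]uint16{{3, 1}, {0, 2}, {7, 0}} // chunk 0 = least significant

	nbChunks := len(digitsPerChunk)
	chChunks := make([]chan int, nbChunks)
	for j := range chChunks {
		chChunks[j] = make(chan int, 1)
	}

	// scatter: one goroutine per chunk, each summing its own digits
	for j := nbChunks - 1; j >= 0; j-- {
		go func(j int) {
			sum := 0
			for _, d := range digitsPerChunk[j] {
				sum += int(d)
			}
			chChunks[j] <- sum
		}(j)
	}

	// gather: fold from the most significant chunk down, multiplying by 2^c
	// between chunks (the real code doubles the running point c times)
	total := 0
	for j := nbChunks - 1; j >= 0; j-- {
		total = total<<c + <-chChunks[j]
	}
	fmt.Println("msm-like reduction:", total)
}
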
n := len(points) + processLastChunk := getChunkProcessorG1(lastC(c)) go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { + processChunk := getChunkProcessorG1(c) go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } @@ -228,6 +229,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + processChunk := getChunkProcessorG1(c) if !splitFirstChunk { go processChunk(0, chChunks[0], c, points, digits[:n]) } else { @@ -368,80 +370,79 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG2(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG2(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) + 
return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC10] + } + return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC11] + } + return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC12] + } + return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC13] + } + return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC14] + } + return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC15] + } + return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC16] + } + return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: - panic("not implemented") + // panic("will not happen c != previous values is not generated by templates") + 
return processChunkG2Jacobian[bucketg2JacExtendedC16]
	}
}

-func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool,
-	processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac {
+func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac {

	nbChunks := computeNbChunks(c)

@@ -458,9 +459,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi
	// the last chunk may be processed with a different method than the rest, as it could be smaller.
	n := len(points)
+	processLastChunk := getChunkProcessorG2(lastC(c))
	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:])

	for j := int(nbChunks - 2); j > 0; j-- {
+		processChunk := getChunkProcessorG2(c)
		go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n])
	}

@@ -468,6 +471,7 @@
	// --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed
	// in the ~same amount of time
	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
+		processChunk := getChunkProcessorG2(c)
		if !splitFirstChunk {
			go processChunk(0, chChunks[0], c, points, digits[:n])
		} else {
@@ -528,6 +532,17 @@ func computeNbChunks(c uint64) uint64 {
	return nbChunks
}

+// lastC returns the size of the last window to process; if c divides the
+// scalar size (fr.Bits + 1), it returns c; otherwise it returns the
+// remainder (fr.Bits + 1) mod c, which is always smaller than c
+func lastC(c uint64) uint64 {
+	const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition
+	if n%c == 0 {
+		return c
+	}
+	return n - (c * (n / c))
+}
+
 // partitionScalars computes, for each scalar over c-bit wide windows, nbChunk digits
 // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and subtract
 // 2^{c} from the current digit, making it negative.
diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go
index f6fb9e2aac..f452ac210e 100644
--- a/ecc/bls12-381/multiexp_affine.go
+++ b/ecc/bls12-381/multiexp_affine.go
@@ -50,64 +50,30 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af
	}

	// setup for the batch affine;
-	// we do that instead of a separate object to give enough hints to the compiler to..
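The partitionScalars comment above describes a signed-digit recoding. A runnable toy version of that rule (every window strictly larger than 2^{c-1} borrows 2^c from the next window, leaving each digit in [-2^{c-1}, 2^{c-1}]), independent of the library's packed uint16 digit representation:

package main

import "fmt"

func recode(scalar uint64, c uint) []int {
	max := int64(1) << (c - 1) // 2^{c-1}
	mask := uint64(1)<<c - 1
	var digits []int
	carry := int64(0)
	for scalar != 0 || carry != 0 {
		d := int64(scalar&mask) + carry
		scalar >>= c
		carry = 0
		if d > max {
			d -= int64(1) << c // borrow 2^c from the next window
			carry = 1
		}
		digits = append(digits, int(d))
	}
	return digits
}

func main() {
	// 246 = 0b1111_0110; with c = 4 the naive windows are [6, 15], but 15
	// recodes to -1 with a carry, giving digits [6, -1, 1]: 6 - 16 + 256.
	fmt.Println(recode(246, 4))
}
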
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG1Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G1Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -149,15 +115,12 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -166,19 +129,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -192,7 +142,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -214,7 +164,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } @@ -380,64 +329,30 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } // setup for the batch affine; - // we do that instead of a separate object to give enough hints to the compiler to.. 
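A note on why add resolves the equal-X cases up front instead of batching them: the batched addition shares one inverted denominator per point, and that denominator is the difference of the X coordinates, which vanishes for both P + P and P + (-P). A toy classification of the cases, with a hypothetical stand-in point type:

package main

import "fmt"

type pt struct {
	x, y int
	inf  bool
}

// classify mirrors the branch order of the add closure above
func classify(bucket, p pt) string {
	switch {
	case bucket.inf:
		return "empty bucket: just set bucket = P"
	case bucket.x == p.x && bucket.y == p.y:
		return "P + P: double in place (rare path)"
	case bucket.x == p.x:
		return "P + (-P): set bucket to infinity"
	default:
		return "generic add: safe to defer to the batch"
	}
}

func main() {
	bucket := pt{x: 1, y: 2}
	for _, p := range []pt{{x: 1, y: 2}, {x: 1, y: -2}, {x: 3, y: 4}} {
		fmt.Println(classify(bucket, p))
	}
	fmt.Println(classify(pt{inf: true}, pt{x: 1, y: 2}))
}
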
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG2Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G2Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -479,15 +394,12 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -496,19 +408,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -522,7 +421,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -544,7 +443,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index 1f8539c0bf..5b0b8eb7cc 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -21,7 +21,6 @@ import ( "math/big" "math/bits" "math/rand" - "runtime" "sync" "testing" "time" @@ -91,8 +90,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], 
ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -127,8 +125,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -163,8 +162,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -231,7 +231,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -373,8 +373,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -407,8 +406,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -443,8 +443,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -511,7 +512,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 970cd52000..894b89a73d 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -128,83 +128,79 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG1(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG1(p *G1Jac, c int, 
points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG1(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] - _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] - _innerMsmG1(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] - _innerMsmG1(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] - _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC10] + } + return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + 
return processChunkG1Jacobian[bucketg1JacExtendedC11] + } + return processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] - _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC12] + } + return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] - _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC13] + } + return processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] - _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC14] + } + return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] - _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC15] + } + return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] - _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } + return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: - panic("not implemented") + // panic("will not happen c != previous values is not generated by templates") + return processChunkG1Jacobian[bucketg1JacExtendedC16] } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { nbChunks := computeNbChunks(c) @@ -221,9 +217,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. 
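// A minimal sketch (not part of the generated code) of the dispatch introduced
// above: getChunkProcessorG1 maps a window width to a single processing
// function, which is requested both for the middle chunks (width c) and for the
// narrower last chunk (width lastC(c)). Hypothetical names stand in for the
// generated generic instantiations:
func selectProcessorSketch(c uint64, mustBeExt bool) string {
	switch {
	case c >= 4 && c <= 9:
		return "jacobian-extended" // the only instantiation generated for small windows
	case c >= 10 && c <= 16 && !mustBeExt:
		return "batch-affine" // default for large windows
	case c >= 10 && c <= 16:
		return "jacobian-extended" // mustBeExt forces the fallback
	default:
		return "jacobian-extended, C16 buckets" // oversized fallback for widths without a dedicated instantiation
	}
}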
n := len(points) + processLastChunk := getChunkProcessorG1(lastC(c)) go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { + processChunk := getChunkProcessorG1(c) go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } @@ -231,6 +229,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + processChunk := getChunkProcessorG1(c) if !splitFirstChunk { go processChunk(0, chChunks[0], c, points, digits[:n]) } else { @@ -371,83 +370,79 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG2(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG2(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] - _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] - _innerMsmG2(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] - _innerMsmG2(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - 
processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] - _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC10] + } + return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC11] + } + return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] - _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC12] + } + return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] - _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC13] + } + return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] - _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC14] + } + return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] - _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC15] + } + return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] - _innerMsmG2(p, 16, points, digits, 
splitFirstChunk, processChunk, processLastChunk)
+		if mustBeExt {
+			return processChunkG2Jacobian[bucketg2JacExtendedC16]
+		}
+		return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16]
 	default:
-		panic("not implemented")
+		// panic("will not happen: c values other than the ones above are not generated by the templates")
+		return processChunkG2Jacobian[bucketg2JacExtendedC16]
 	}
 }
 
-func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool,
-	processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac {
+func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac {
 
 	nbChunks := computeNbChunks(c)
 
@@ -464,9 +459,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi
 	// the last chunk may be processed with a different method than the rest, as it could be smaller.
 	n := len(points)
+	processLastChunk := getChunkProcessorG2(lastC(c))
 	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:])
 
 	for j := int(nbChunks - 2); j > 0; j-- {
+		processChunk := getChunkProcessorG2(c)
 		go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n])
 	}
 
@@ -474,6 +471,7 @@
 	// --> if that is the case, we launch 2 goroutines to process the chunk to ensure all chunks are processed
 	// in the ~same amount of time
 	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
+		processChunk := getChunkProcessorG2(c)
 		if !splitFirstChunk {
 			go processChunk(0, chChunks[0], c, points, digits[:n])
 		} else {
@@ -534,6 +532,17 @@ func computeNbChunks(c uint64) uint64 {
 	return nbChunks
 }
 
+// lastC returns the size of the last window (the most significant one);
+// if c divides the scalar size, it returns c;
+// otherwise it returns the remainder, which is strictly smaller than c
+func lastC(c uint64) uint64 {
+	const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition
+	if n%c == 0 {
+		return c
+	}
+	return n - (c * (n / c))
+}
+
 // partitionScalars computes, for each scalar over c-bit wide windows, nbChunks digits
 // if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
 // 2^c from the current digit, making it negative.
diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go
index 135ccaf2b2..fa683c78a1 100644
--- a/ecc/bls24-315/multiexp_affine.go
+++ b/ecc/bls24-315/multiexp_affine.go
@@ -50,64 +50,30 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af
 	}
 
 	// setup for the batch affine;
-	// we do that instead of a separate object to give enough hints to the compiler to..
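// A worked example of the lastC helper added above, self-contained and assuming
// a hypothetical 253-bit scalar field, so n = fr.Bits+1 = 254 (the real value is
// curve-dependent):
func lastCSketch(c uint64) uint64 {
	const n = uint64(253 + 1) // stand-in for fr.Bits + 1; the +1 absorbs the decomposition carry
	if n%c == 0 {
		return c
	}
	return n - c*(n/c) // equivalently n % c
}

// For instance lastCSketch(16) == 14 (254 = 15*16 + 14): the most significant
// window is only 14 bits wide, so the last chunk is handed to the C14 processor;
// lastCSketch(127) == 127, since 127 divides 254 exactly.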
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG1Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G1Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -149,15 +115,12 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -166,19 +129,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -192,7 +142,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -214,7 +164,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } @@ -380,64 +329,30 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } // setup for the batch affine; - // we do that instead of a separate object to give enough hints to the compiler to.. 
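// The scheduling invariant behind the rewrite of processChunkG1BatchAffine
// above, as a self-contained sketch: a bucket may enter the current batch at
// most once, because one shared inversion serves every lambda denominator in
// the batch, and a second addition into the same bucket would read a stale
// value. Conflicting operations are parked in a queue and retried after a
// flush. Here uint16 bucket ids stand in for (bucket, point) pairs, a map
// stands in for the bitSet BS, and the final drain of queue and batch is
// elided:
func conflictBatchSketch(ops []uint16, batchSize int) (batches [][]uint16) {
	inBatch := make(map[uint16]bool)
	var batch, queue []uint16
	flush := func() {
		batches = append(batches, batch) // batchAddG1Affine runs here in the real code
		batch = nil
		inBatch = make(map[uint16]bool)
	}
	for _, b := range ops {
		if inBatch[b] {
			queue = append(queue, b) // conflicts with the current batch
			continue
		}
		inBatch[b] = true
		batch = append(batch, b)
		if len(batch) == batchSize {
			flush()
			for i := len(queue) - 1; i >= 0; i-- { // retry queued ops, newest first
				if inBatch[queue[i]] {
					continue
				}
				inBatch[queue[i]] = true
				batch = append(batch, queue[i])
				queue = append(queue[:i], queue[i+1:]...)
				if len(batch) == batchSize {
					flush()
				}
			}
		}
	}
	return batches
}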
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG2Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G2Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -479,15 +394,12 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -496,19 +408,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -522,7 +421,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -544,7 +443,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index 9307ba079d..9bda4cefd3 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -21,7 +21,6 @@ import ( "math/big" "math/bits" "math/rand" - "runtime" "sync" "testing" "time" @@ -91,8 +90,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], 
ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -127,8 +125,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -163,8 +162,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -231,7 +231,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -373,8 +373,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -407,8 +406,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -443,8 +443,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -511,7 +512,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index 52f2fe363f..7cd831c238 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -128,80 +128,79 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG1(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG1(p *G1Jac, c int, 
points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG1(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC10] + } + return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC11] + } + return processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, 
ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC12] + } + return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC13] + } + return processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC14] + } + return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC15] + } + return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } + return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: - panic("not implemented") + // panic("will not happen c != previous values is not generated by templates") + return processChunkG1Jacobian[bucketg1JacExtendedC16] } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { nbChunks := computeNbChunks(c) @@ -218,9 +217,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. 
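// A note on the default arm of getChunkProcessorG1 above: the selector is also
// invoked with lastC(c), which can be narrower than any width the switch
// handles explicitly (the old code bound e.g. bucketg1JacExtendedC1 or C3 for
// those last chunks). Falling back to the C16 extended-Jacobian processor
// appears safe because, in the signed-digit scheme used here (digits in
// [-2^{c-1}, 2^{c-1}]), a window of width w touches at most 2^(w-1) buckets,
// so a C16-sized bucket array over-provisions any narrower window:
func bucketsNeededSketch(w uint64) uint64 {
	return 1 << (w - 1) // e.g. w = 3 needs only 4 buckets, well within the 1<<15 of a C16 array
}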
n := len(points) + processLastChunk := getChunkProcessorG1(lastC(c)) go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { + processChunk := getChunkProcessorG1(c) go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } @@ -228,6 +229,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + processChunk := getChunkProcessorG1(c) if !splitFirstChunk { go processChunk(0, chChunks[0], c, points, digits[:n]) } else { @@ -368,80 +370,79 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG2(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG2(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) + 
return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC10] + } + return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC11] + } + return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC12] + } + return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC13] + } + return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC14] + } + return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC15] + } + return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC16] + } + return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: - panic("not implemented") + // panic("will not happen c != previous values is not generated by templates") + 
return processChunkG2Jacobian[bucketg2JacExtendedC16]
 	}
 }
 
-func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool,
-	processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac {
+func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac {
 
 	nbChunks := computeNbChunks(c)
 
@@ -458,9 +459,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi
 	// the last chunk may be processed with a different method than the rest, as it could be smaller.
 	n := len(points)
+	processLastChunk := getChunkProcessorG2(lastC(c))
 	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:])
 
 	for j := int(nbChunks - 2); j > 0; j-- {
+		processChunk := getChunkProcessorG2(c)
 		go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n])
 	}
 
@@ -468,6 +471,7 @@
 	// --> if that is the case, we launch 2 goroutines to process the chunk to ensure all chunks are processed
 	// in the ~same amount of time
 	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
+		processChunk := getChunkProcessorG2(c)
 		if !splitFirstChunk {
 			go processChunk(0, chChunks[0], c, points, digits[:n])
 		} else {
@@ -528,6 +532,17 @@ func computeNbChunks(c uint64) uint64 {
 	return nbChunks
 }
 
+// lastC returns the size of the last window (the most significant one);
+// if c divides the scalar size, it returns c;
+// otherwise it returns the remainder, which is strictly smaller than c
+func lastC(c uint64) uint64 {
+	const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition
+	if n%c == 0 {
+		return c
+	}
+	return n - (c * (n / c))
+}
+
 // partitionScalars computes, for each scalar over c-bit wide windows, nbChunks digits
 // if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
 // 2^c from the current digit, making it negative.
diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go
index 252a20acca..913b2e9308 100644
--- a/ecc/bls24-317/multiexp_affine.go
+++ b/ecc/bls24-317/multiexp_affine.go
@@ -50,64 +50,30 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af
 	}
 
 	// setup for the batch affine;
-	// we do that instead of a separate object to give enough hints to the compiler to..
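// The concurrency shape shared by _innerMsmG1 and _innerMsmG2 above, as a
// runnable sketch: one buffered channel per window, the last (possibly
// narrower) window dispatched with its own processor from
// getChunkProcessor*(lastC(c)). Hypothetical int results stand in for the
// extended-Jacobian chunk sums, and the final reduction is elided:
func fanOutSketch(nbChunks int, process func(chunkID int, out chan<- int)) []chan int {
	chChunks := make([]chan int, nbChunks)
	for i := range chChunks {
		chChunks[i] = make(chan int, 1) // buffered so a worker never blocks on send
	}
	go process(nbChunks-1, chChunks[nbChunks-1]) // last chunk first
	for j := nbChunks - 2; j > 0; j-- {          // middle chunks
		go process(j, chChunks[j])
	}
	if nbChunks > 1 {
		go process(0, chChunks[0]) // chunk 0; the real code may split it across two goroutines
	}
	return chChunks
}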
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG1Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G1Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -149,15 +115,12 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -166,19 +129,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -192,7 +142,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -214,7 +164,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } @@ -380,64 +329,30 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } // setup for the batch affine; - // we do that instead of a separate object to give enough hints to the compiler to.. 
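// Why the add() closure above peels off special cases before batching: the
// batched affine addition computes lambda = (y2 - y1)/(x2 - x1) with one
// inversion shared across the whole batch, and that denominator is unusable
// when the bucket is still at infinity or when the two x-coordinates coincide
// (a doubling or a P + (-P) cancellation). A sketch of the guard, with ints
// standing in for field elements:
type affinePointSketch struct {
	x, y  int
	isInf bool
}

func canBatchAddSketch(bucket, p affinePointSketch) bool {
	if bucket.isInf {
		return false // resolved eagerly: the bucket simply becomes p
	}
	if bucket.x == p.x {
		return false // same x: doubling (p.y == bucket.y) or cancellation to infinity
	}
	return true
}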
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG2Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G2Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -479,15 +394,12 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -496,19 +408,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -522,7 +421,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -544,7 +443,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index 5945e42e8a..c166598a34 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -21,7 +21,6 @@ import ( "math/big" "math/bits" "math/rand" - "runtime" "sync" "testing" "time" @@ -91,8 +90,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], 
ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -127,8 +125,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -163,8 +162,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -231,7 +231,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -373,8 +373,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -407,8 +406,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -443,8 +443,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -511,7 +512,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index dd3d4c2183..9b3140de09 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -128,81 +128,79 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG1(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG1(p *G1Jac, c int, points 
[]G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG1(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] - _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC10] + } + return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] - _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC11] + } + return 
processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC12] + } + return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC13] + } + return processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC14] + } + return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] - _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC15] + } + return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] - _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } + return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: - panic("not implemented") + // panic("will not happen c != previous values is not generated by templates") + return processChunkG1Jacobian[bucketg1JacExtendedC16] } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { nbChunks := computeNbChunks(c) @@ -219,9 +217,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. 
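// The splitFirstChunk flag threaded into _innerMsmG1 above comes from the 10%
// heuristic computed in MultiExp: scalars that fit entirely in the lowest
// c-bit window contribute work only to chunk 0, so when at least a tenth of
// the scalars are "small", chunk 0 is split across two goroutines, each
// processing half the points, and the two partial sums are combined
// afterwards. The decision itself reduces to:
func shouldSplitFirstChunkSketch(smallValues, nbScalars int) bool {
	return float64(smallValues)/float64(nbScalars) >= 0.1
}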
n := len(points) + processLastChunk := getChunkProcessorG1(lastC(c)) go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { + processChunk := getChunkProcessorG1(c) go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } @@ -229,6 +229,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + processChunk := getChunkProcessorG1(c) if !splitFirstChunk { go processChunk(0, chChunks[0], c, points, digits[:n]) } else { @@ -369,81 +370,79 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG2(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG2(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] - _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 
9, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC10] + } + return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] - _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC11] + } + return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC12] + } + return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC13] + } + return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC14] + } + return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] - _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC15] + } + return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] - _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC16] + } + return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, 
cG2AffineC16] default: - panic("not implemented") + // panic("will not happen c != previous values is not generated by templates") + return processChunkG2Jacobian[bucketg2JacExtendedC16] } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac { nbChunks := computeNbChunks(c) @@ -460,9 +459,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) + processLastChunk := getChunkProcessorG2(lastC(c)) go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { + processChunk := getChunkProcessorG2(c) go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } @@ -470,6 +471,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + processChunk := getChunkProcessorG2(c) if !splitFirstChunk { go processChunk(0, chChunks[0], c, points, digits[:n]) } else { @@ -530,6 +532,17 @@ func computeNbChunks(c uint64) uint64 { return nbChunks } +// lastC returns the size, in bits, of the last window of the decomposition; +// we decompose n = fr.Bits+1 bits (one extra bit for the potential NAF carry), +// so it returns c when c divides n, and the remainder n mod c (always < c) otherwise +func lastC(c uint64) uint64 { + const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition + if n%c == 0 { + return c + } + return n - (c * (n / c)) +} + // partitionScalars computes, for each scalar over c-bit wide windows, nbChunk digits // if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract // 2^{c} from the current digit, making it negative. diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index 7413da377f..8f6e9073b8 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -50,64 +50,30 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af } // setup for the batch affine; - // we do that instead of a separate object to give enough hints to the compiler to..
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG1Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G1Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -149,15 +115,12 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -166,19 +129,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -192,7 +142,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -214,7 +164,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } @@ -380,64 +329,30 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } // setup for the batch affine; - // we do that instead of a separate object to give enough hints to the compiler to.. 
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG2Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G2Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -479,15 +394,12 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -496,19 +408,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -522,7 +421,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -544,7 +443,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index 23dc3b5897..5962e0b859 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -21,7 +21,6 @@ import ( "math/big" "math/bits" "math/rand" - "runtime" "sync" "testing" "time" @@ -91,8 +90,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 
runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -127,8 +125,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -163,8 +162,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -231,7 +231,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -373,8 +373,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -407,8 +406,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -443,8 +443,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -511,7 +512,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index b16ddb3d18..71170c6cf9 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -128,46 +128,43 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG1(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars 
[]fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG1(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC5] case 8: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC8] case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] - _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } + return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: - panic("not implemented") + // panic("will not happen c != previous values is not generated by templates") + return processChunkG1Jacobian[bucketg1JacExtendedC16] } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { nbChunks := computeNbChunks(c) @@ -184,9 +181,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. 
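// NOTE (editorial sketch, not part of the generated code): the dispatch below is a
// plain fan-out: one goroutine and one buffered channel per chunk, with the caller
// folding the per-chunk sums afterwards. A generic, self-contained version of that
// shape (fanOut and its parameters are illustrative, not the library's API):

func fanOut[T any](nbChunks int, process func(chunk int, out chan<- T)) []chan T {
	chs := make([]chan T, nbChunks)
	for i := range chs {
		chs[i] = make(chan T, 1) // buffered: the worker never blocks on send
		go process(i, chs[i])
	}
	return chs // receive from chs[i] to collect the partial sum of chunk i
}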
n := len(points) + processLastChunk := getChunkProcessorG1(lastC(c)) go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { + processChunk := getChunkProcessorG1(c) go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } @@ -194,6 +193,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + processChunk := getChunkProcessorG1(c) if !splitFirstChunk { go processChunk(0, chChunks[0], c, points, digits[:n]) } else { @@ -334,46 +334,43 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG2(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG2(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC5] case 8: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC8] case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] - _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC16] + } + return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: - panic("not implemented") + // panic("will not happen c != previous values is not generated by templates") + return 
processChunkG2Jacobian[bucketg2JacExtendedC16] } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac { nbChunks := computeNbChunks(c) @@ -390,9 +387,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) + processLastChunk := getChunkProcessorG2(lastC(c)) go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { + processChunk := getChunkProcessorG2(c) go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } @@ -400,6 +399,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + processChunk := getChunkProcessorG2(c) if !splitFirstChunk { go processChunk(0, chChunks[0], c, points, digits[:n]) } else { @@ -460,6 +460,17 @@ func computeNbChunks(c uint64) uint64 { return nbChunks } +// lastC returns the size, in bits, of the last window of the decomposition; +// we decompose n = fr.Bits+1 bits (one extra bit for the potential NAF carry), +// so it returns c when c divides n, and the remainder n mod c (always < c) otherwise +func lastC(c uint64) uint64 { + const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition + if n%c == 0 { + return c + } + return n - (c * (n / c)) +} + // partitionScalars computes, for each scalar over c-bit wide windows, nbChunk digits // if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract // 2^{c} from the current digit, making it negative. diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index f25c59a8b6..870483e934 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -49,64 +49,30 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af } // setup for the batch affine; - // we do that instead of a separate object to give enough hints to the compiler to..
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG1Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G1Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -148,15 +114,12 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -165,19 +128,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -191,7 +141,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -213,7 +163,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } @@ -307,64 +256,30 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } // setup for the batch affine; - // we do that instead of a separate object to give enough hints to the compiler to.. 
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG2Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G2Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -406,15 +321,12 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -423,19 +335,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -449,7 +348,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -471,7 +370,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 4c40debed6..45bef3125a 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -21,7 +21,6 @@ import ( "math/big" "math/bits" "math/rand" - "runtime" "sync" "testing" "time" @@ -91,8 +90,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], 
ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -127,8 +125,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -163,8 +162,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -231,7 +231,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -373,8 +373,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -407,8 +406,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -443,8 +443,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -511,7 +512,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index be86f40a8f..28672b9219 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -128,47 +128,43 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG1(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG1(p *G1Jac, c int, points 
[]G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG1(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC5] case 8: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC8] case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] - _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } + return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: - panic("not implemented") + // panic("will not happen c != previous values is not generated by templates") + return processChunkG1Jacobian[bucketg1JacExtendedC16] } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { nbChunks := computeNbChunks(c) @@ -185,9 +181,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. 
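// NOTE (editorial sketch, not part of the generated code): the splitFirstChunk path
// a few lines below halves chunk 0 across two goroutines when many scalars are small,
// since small scalars only populate the first window and would make chunk 0 the
// straggler. A minimal generic version of that split-and-fold (hypothetical names):

func splitChunk0[T any](n int, process func(lo, hi int, out chan<- T), fold func(a, b T) T) T {
	chLo, chHi := make(chan T, 1), make(chan T, 1)
	go process(0, n/2, chLo) // first half of the points/digits
	go process(n/2, n, chHi) // second half
	return fold(<-chLo, <-chHi) // the two half-sums add up to the chunk 0 sum
}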
n := len(points) + processLastChunk := getChunkProcessorG1(lastC(c)) go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { + processChunk := getChunkProcessorG1(c) go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } @@ -195,6 +193,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + processChunk := getChunkProcessorG1(c) if !splitFirstChunk { go processChunk(0, chChunks[0], c, points, digits[:n]) } else { @@ -335,47 +334,43 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG2(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG2(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC5] case 8: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC8] case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] - _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC16] + } + return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: - panic("not implemented") + // panic("will not happen c != previous values 
is not generated by templates") + return processChunkG2Jacobian[bucketg2JacExtendedC16] } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac { nbChunks := computeNbChunks(c) @@ -392,9 +387,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) + processLastChunk := getChunkProcessorG2(lastC(c)) go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { + processChunk := getChunkProcessorG2(c) go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } @@ -402,6 +399,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + processChunk := getChunkProcessorG2(c) if !splitFirstChunk { go processChunk(0, chChunks[0], c, points, digits[:n]) } else { @@ -462,6 +460,17 @@ func computeNbChunks(c uint64) uint64 { return nbChunks } +// lastC returns the size, in bits, of the last window of the decomposition; +// we decompose n = fr.Bits+1 bits (one extra bit for the potential NAF carry), +// so it returns c when c divides n, and the remainder n mod c (always < c) otherwise +func lastC(c uint64) uint64 { + const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition + if n%c == 0 { + return c + } + return n - (c * (n / c)) +} + // partitionScalars computes, for each scalar over c-bit wide windows, nbChunk digits // if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract // 2^{c} from the current digit, making it negative. diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index 419acf811c..ca6e7c172a 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -49,64 +49,30 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af } // setup for the batch affine; - // we do that instead of a separate object to give enough hints to the compiler to..
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG1Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G1Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -148,15 +114,12 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -165,19 +128,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -191,7 +141,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -213,7 +163,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } @@ -307,64 +256,30 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } // setup for the batch affine; - // we do that instead of a separate object to give enough hints to the compiler to.. 
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG2Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G2Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -406,15 +321,12 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -423,19 +335,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -449,7 +348,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -471,7 +370,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index d79044f69c..57956e9c5a 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -21,7 +21,6 @@ import ( "math/big" "math/bits" "math/rand" - "runtime" "sync" "testing" "time" @@ -91,8 +90,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], 
ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -127,8 +125,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -163,8 +162,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -231,7 +231,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -373,8 +373,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -407,8 +406,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -443,8 +443,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -511,7 +512,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index 7272a5efef..cc02ad9b57 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -128,47 +128,43 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG1(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG1(p *G1Jac, c int, points 
[]G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG1(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] - _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC5] case 8: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] - _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC8] case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] - _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } + return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: - panic("not implemented") + // panic("will not happen c != previous values is not generated by templates") + return processChunkG1Jacobian[bucketg1JacExtendedC16] } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { nbChunks := computeNbChunks(c) @@ -185,9 +181,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. 
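For intuition on that last-chunk remark: with fr.Bits = 377 (BW6-761's scalar field) and c = 16, the fr.Bits+1 = 378 digit bits split into 23 full 16-bit windows plus one 10-bit window, which is why the removed code paired the C16 processor with a C10 one. A self-contained sketch, assuming computeNbChunks rounds up over fr.Bits+1, and copying lastC as it is introduced further down this diff:

package main

import "fmt"

const frBits = 377 // e.g. BW6-761's scalar field size

// assumption: computeNbChunks rounds up over fr.Bits+1, consistent with lastC below
func computeNbChunks(c uint64) uint64 {
	const n = frBits + 1 // +1 for the potential carry of the NAF decomposition
	nbChunks := n / c
	if n%c != 0 {
		nbChunks++
	}
	return nbChunks
}

// copied from this patch
func lastC(c uint64) uint64 {
	const n = frBits + 1
	if n%c == 0 {
		return c
	}
	return n - (c * (n / c))
}

func main() {
	// c=16: 378 = 23*16 + 10, so 24 chunks and a 10-bit last window --
	// matching the C16/C10 processor pair the removed code wired by hand.
	fmt.Println(computeNbChunks(16), lastC(16)) // 24 10
}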
n := len(points) + processLastChunk := getChunkProcessorG1(lastC(c)) go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { + processChunk := getChunkProcessorG1(c) go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } @@ -195,6 +193,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + processChunk := getChunkProcessorG1(c) if !splitFirstChunk { go processChunk(0, chChunks[0], c, points, digits[:n]) } else { @@ -335,47 +334,43 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG2(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG2(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] - _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC5] case 8: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] - _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC8] case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] - _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC16] + } + return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: - panic("not implemented") + // panic("will not happen c != previous values 
is not generated by templates")
+		return processChunkG2Jacobian[bucketg2JacExtendedC16]
 	}
 }
 
-func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool,
-	processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac {
+func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac {
 
 	nbChunks := computeNbChunks(c)
 
@@ -392,9 +387,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi
 	// the last chunk may be processed with a different method than the rest, as it could be smaller.
 	n := len(points)
+	processLastChunk := getChunkProcessorG2(lastC(c))
 	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:])
 
 	for j := int(nbChunks - 2); j > 0; j-- {
+		processChunk := getChunkProcessorG2(c)
 		go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n])
 	}
 
@@ -402,6 +399,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi
 	// --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed
 	// in the ~same amount of time
 	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
+		processChunk := getChunkProcessorG2(c)
 		if !splitFirstChunk {
 			go processChunk(0, chChunks[0], c, points, digits[:n])
 		} else {
@@ -462,6 +460,17 @@ func computeNbChunks(c uint64) uint64 {
 	return nbChunks
 }
 
+// lastC returns the size of the last window for a scalar: if c divides the
+// scalar size (fr.Bits+1), it returns c; otherwise it returns the remainder
+// (fr.Bits+1) mod c, which is strictly less than c
+func lastC(c uint64) uint64 {
+	const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition
+	if n%c == 0 {
+		return c
+	}
+	return n - (c * (n / c))
+}
+
 // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits
 // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract
 // 2^{c} to the current digit, making it negative.
diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go
index 4f039c26bc..3b653b6563 100644
--- a/ecc/bw6-761/multiexp_affine.go
+++ b/ecc/bw6-761/multiexp_affine.go
@@ -49,64 +49,30 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af
 	}
 
 	// setup for the batch affine;
-	// we do that instead of a separate object to give enough hints to the compiler to..
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG1Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G1Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -148,15 +114,12 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -165,19 +128,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -191,7 +141,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -213,7 +163,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } @@ -307,64 +256,30 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } // setup for the batch affine; - // we do that instead of a separate object to give enough hints to the compiler to.. 
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG2Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G2Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -406,15 +321,12 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -423,19 +335,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -449,7 +348,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -471,7 +370,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index 2dcc22a913..968613803a 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -21,7 +21,6 @@ import ( "math/big" "math/bits" "math/rand" - "runtime" "sync" "testing" "time" @@ -91,8 +90,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], 
ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -127,8 +125,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -163,8 +162,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -231,7 +231,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -373,8 +373,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -407,8 +406,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -443,8 +443,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -511,7 +512,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index 7066a005af..ef2b67b636 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -43,6 +43,18 @@ func computeNbChunks(c uint64) uint64 { return nbChunks } +// return the last window size for a 
scalar; if c divides the scalar size (fr.Bits+1)
+// then it returns c
+// otherwise, returns the remainder (fr.Bits+1) mod c, which is strictly less than c
+func lastC(c uint64) uint64 {
+	const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition
+	if n%c == 0 {
+		return c
+	}
+	return n - (c * (n / c))
+}
+
+
 // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits
 // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract
 // 2^{c} to the current digit, making it negative.
@@ -399,53 +411,43 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem
 	}
 	}
 
-	innerMsm{{ $.UPointName }}(p, int(C), points, scalars, config)
-
-	return p, nil
-}
-
-
-func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffine }}, scalars []fr.Element, config ecc.MultiExpConfig) {
-
 	// partition the scalars
 	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
 	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
 	// var smallValues int
-	digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks)
+	digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
 
 	// if we have more than 10% of small values, we split the processing of the first chunk in 2
 	// we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time
 	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
 
-	{{- /* TODO @gbotrel need to deal with cases where lastC == 1 ; having a whole chunk with 1-bit window makes no sense */}}
-	{{- /* also need to determine until which window size the ext-jacobian version is worth it. */}}
+	_innerMsm{{ $.UPointName }}(p, C, points, digits, splitFirstChunk)
+
+	return p, nil
+}
+
+
+func getChunkProcessor{{ $.UPointName }}(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, digits []uint16) {
+	mustBeExt := false
 	switch c {
-		{{range $c :=  $.CRange}}
-		{{- $lc := lastC $c}}
-		case {{$c}}:
-			{{- if le $c 9}}
-			processChunk := processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}]
-			{{- else}}
-			processChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{$c}}, bitSetC{{$c}}, p{{$.TAffine}}C{{$c}}, pp{{$.TAffine}}C{{$c}}, q{{$.TAffine}}C{{$c}}, c{{$.TAffine}}C{{$c}}]
-			{{- end}}
-			{{- if eq $c $lc}}
-			_innerMsm{{ $.UPointName }}(p, {{$c}}, points, digits, splitFirstChunk, processChunk, processChunk)
-			{{- else}}
-				{{- if le $lc 9}}
-				processLastChunk := processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{lastC $c}}]
+	{{range $c :=  $.CRange}}
+	case {{$c}}:
+		{{- if le $c 9}}
+		return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}]
 		{{- else}}
-				processLastChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{lastC $c}}, bitSetC{{lastC $c}}, p{{$.TAffine}}C{{lastC $c}}, pp{{$.TAffine}}C{{lastC $c}}, q{{$.TAffine}}C{{lastC $c}}, c{{$.TAffine}}C{{lastC $c}}]
+		if mustBeExt {
+			return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}]
+		}
+		return processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{$c}}, bitSetC{{$c}}, p{{$.TAffine}}C{{$c}}, pp{{$.TAffine}}C{{$c}}, q{{$.TAffine}}C{{$c}}, c{{$.TAffine}}C{{$c}}]
 		{{- end}}
-		_innerMsm{{ $.UPointName }}(p, {{$c}}, points, digits, splitFirstChunk,
processChunk, processLastChunk) {{- end}} - {{- end}} - default: - panic("not implemented") + default: + // panic("will not happen c != previous values is not generated by templates") + return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C16] } } -func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, digits []uint16, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, digits []uint16)) *{{ $.TJacobian }} { +func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, digits []uint16, splitFirstChunk bool) *{{ $.TJacobian }} { nbChunks := computeNbChunks(c) @@ -462,9 +464,11 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) + processLastChunk := getChunkProcessor{{ $.UPointName }}(lastC(c)) go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j >0; j-- { + processChunk := getChunkProcessor{{ $.UPointName }}(c) go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } @@ -472,6 +476,7 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + processChunk := getChunkProcessor{{ $.UPointName }}(c) if !splitFirstChunk { go processChunk(0,chChunks[0], c, points, digits[:n]) } else { diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index 312ea0897f..8462321866 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -55,67 +55,32 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet, T } // setup for the batch affine; - // we do that instead of a separate object to give enough hints to the compiler to.. 
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func () { - if (cptAdd) == 0 { - return - } batchAdd{{ $.TAffine }}[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOp{{$.TAffine}}) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } add := func(bucketID uint16, PP *{{$.TAffine}}, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -147,7 +112,6 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet, T return } - bucketIds[bucketID] = true R[cptAdd] = BK if isAdd { @@ -157,17 +121,14 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet, T } cptAdd++ } - - var queue TQ - qID := 0 processQueue := func () { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -176,18 +137,6 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet, T } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } for i, digit := range digits { @@ -202,7 +151,7 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet, T bucketID-=1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -224,7 +173,6 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet, T add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index 070481bf7b..455142ceb6 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl 
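A note on the batch-affine processors above: batchAdd* computes every lambda denominator of a batch with one shared field inversion, and all additions read their bucket's coordinates before any result is written back, so a bucket may appear at most once per batch. That is what the bucketIds bitset enforces; a second point for the same bucket waits in queue until executeAndReset clears it. The shared-inversion trick itself (Montgomery batch inversion, as in this PR's BatchInvert* helpers) works as below; a toy sketch over math/big rather than fp.Element or fptower.E2:

package main

import (
	"fmt"
	"math/big"
)

// batchInvert computes n modular inverses for the cost of 1 inversion
// plus 3(n-1) multiplications, via prefix products
func batchInvert(a []*big.Int, p *big.Int) []*big.Int {
	res := make([]*big.Int, len(a))
	acc := big.NewInt(1)
	for i := range a {
		res[i] = new(big.Int).Set(acc) // res[i] = a[0]*...*a[i-1]
		acc.Mul(acc, a[i])
		acc.Mod(acc, p)
	}
	acc.ModInverse(acc, p) // the single field inversion
	for i := len(a) - 1; i >= 0; i-- {
		res[i].Mul(res[i], acc)
		res[i].Mod(res[i], p) // now res[i] = a[i]^-1
		acc.Mul(acc, a[i])
		acc.Mod(acc, p)
	}
	return res
}

func main() {
	p := big.NewInt(101)
	inv := batchInvert([]*big.Int{big.NewInt(3), big.NewInt(7), big.NewInt(10)}, p)
	fmt.Println(inv) // [34 29 91]: 3*34, 7*29, 10*91 are all = 1 (mod 101)
}

Trading n inversions for one inversion and a handful of multiplications is what makes affine buckets competitive with the extended-Jacobian ones once the batch is large enough.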
@@ -10,7 +10,6 @@ import (
 	"fmt"
 	"time"
-	"runtime"
 	"math/rand"
 	"math/big"
 	"testing"
@@ -90,8 +89,7 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) {
 				FromMont()
 			}
 
-			innerMsm{{ toUpper $.PointName }}(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks:runtime.NumCPU()})
-
+			r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{})
 			splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128})
 			splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51})
 			return r16.Equal(&splitted1) && r16.Equal(&splitted2)
@@ -135,8 +133,9 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) {
 
 			results := make([]{{ $.TJacobian }}, len(cRange))
-			for i, c := range cRange {
-				innerMsm{{ toUpper $.PointName }}(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks:runtime.NumCPU()})
+			for i := range cRange {
+				// TODO @gbotrel restore test of all C
+				results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{})
 			}
 			for i:=1; i < len(results);i++ {
 				if !results[i].Equal(&results[i-1]) {
@@ -171,8 +170,9 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) {
 			}
 
 			results := make([]{{ $.TJacobian }}, len(cRange))
-			for i, c := range cRange {
-				innerMsm{{ toUpper $.PointName }}(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks:runtime.NumCPU()})
+			for i := range cRange {
+				// TODO @gbotrel restore test for all C
+				results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{})
 			}
 			for i := 1; i < len(results); i++ {
 				if !results[i].Equal(&results[i-1]) {
@@ -245,7 +245,7 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) {
 
 	var testPoint {{ $.TAffine }}
 
-	for i := 5; i <= pow; i++ {
+	for i := 15; i <= pow; i++ {
 		using := 1 << i
 
 		b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) {

From 555ca0d4e0580b290e9e06325bcb339a86fe8bbb Mon Sep 17 00:00:00 2001
From: Gautam Botrel
Date: Tue, 15 Nov 2022 11:40:57 -0600
Subject: [PATCH 24/43] feat: added chunkStats instead of small values

---
 ecc/bls12-377/multiexp.go                     | 158 ++++++++----------
 ecc/bls12-378/multiexp.go                     | 158 ++++++++----------
 ecc/bls12-381/multiexp.go                     | 158 ++++++++----------
 ecc/bls24-315/multiexp.go                     | 158 ++++++++----------
 ecc/bls24-317/multiexp.go                     | 158 ++++++++----------
 ecc/bn254/multiexp.go                         | 158 ++++++++----------
 ecc/bw6-633/multiexp.go                       | 158 ++++++++----------
 ecc/bw6-756/multiexp.go                       | 158 ++++++++----------
 ecc/bw6-761/multiexp.go                       | 158 ++++++++----------
 .../generator/ecc/template/multiexp.go.tmpl   | 115 ++++++-------
 10 files changed, 645 insertions(+), 892 deletions(-)

diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go
index cf970c5246..b04f523cff 100644
--- a/ecc/bls12-377/multiexp.go
+++ b/ecc/bls12-377/multiexp.go
@@ -129,16 +129,9 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul
 	}
 
 	// partition the scalars
-	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
-	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
-	// var smallValues int
-	digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+	digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
 
-	// if we have more than 10% of small values, we split the processing of the first chunk in 2
-	// we may want to do that in innerMsm, but that would incur a cost
of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - _innerMsmG1(p, C, points, digits, splitFirstChunk) + _innerMsmG1(p, C, points, digits, chunkStats) return p, nil } @@ -200,7 +193,7 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { nbChunks := computeNbChunks(c) @@ -217,35 +210,29 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - processLastChunk := getChunkProcessorG1(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG1(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG1(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG1(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. 
chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -371,16 +358,9 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - _innerMsmG2(p, C, points, digits, splitFirstChunk) + _innerMsmG2(p, C, points, digits, chunkStats) return p, nil } @@ -442,7 +422,7 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { nbChunks := computeNbChunks(c) @@ -459,35 +439,29 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - processLastChunk := getChunkProcessorG2(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG2(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG2(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG2(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. 
chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -543,24 +517,25 @@ func lastC(c uint64) uint64 { return n - (c * (n / c)) } +type chunkStat struct { + weight float32 // relative weight compared to other chunks. +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. // negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) // scalarsMont indicates wheter the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, []chunkStat) { // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) - toReturn := make([]uint16, len(scalars)*int(nbChunks)) + digits := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 - // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words + max := int(1 << (c - 1)) // max value we want for our digits + cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) @@ -579,36 +554,22 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK. 
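A worked example of the recoding described in the comments above, for c = 4: the window value 13 exceeds 2^{c-1} = 8, so it becomes the digit 13 - 16 = -3 with a carry of 1 into the next window, and the non-zero digit is then packed into a uint16 with the sign in the low bit. A minimal sketch; the borrow condition is written as >= max here to match the "larger than 2^{c-1}" comment, while the exact boundary handling lives in partitionScalars:

package main

import "fmt"

// recode one c-bit window into a signed digit plus a carry
// for the next window, as the partitionScalars comments describe
func recode(window, c int) (digit, carry int) {
	max := 1 << (c - 1)
	if window >= max {
		return window - (1 << c), 1 // borrow 2^c from the next window
	}
	return window, 0
}

// pack a non-zero signed digit into a uint16, sign in the low bit
// (even = add the point, odd = subtract it), as in this patch
func pack(digit int) uint16 {
	if digit > 0 {
		return uint16(digit) << 1
	}
	return (uint16(-digit-1) << 1) + 1
}

func main() {
	d, carry := recode(13, 4)
	fmt.Println(d, carry, pack(d)) // -3 1 5: subtract from the bucket holding 3*P
}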
- chSmallValues := make(chan int, nbTasks) + chOpsPerChunk := make(chan []int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 + opsPerChunk := make([]int, nbChunks) for i := start; i < end; i++ { - var carry int - scalar := scalars[i] if scalarsMont { scalar.FromMont() } - if scalar.FitsOnOneWord() { + if scalar.IsZero() { // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } + continue } + var carry int + // for each chunk in the scalar, compute the current digit, and an eventual carry for chunk := uint64(0); chunk < nbChunks; chunk++ { s := selectors[chunk] @@ -637,26 +598,39 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // if digit is zero, no impact on result if digit == 0 { continue - } else if digit > 0 { + } + if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } - toReturn[int(chunk)*len(scalars)+i] = bits + digits[int(chunk)*len(scalars)+i] = bits + opsPerChunk[chunk]++ } } - chSmallValues <- smallValues + chOpsPerChunk <- opsPerChunk }, nbTasks) - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o + // aggregate chunk stats + close(chOpsPerChunk) + opsPerChunk := make([]int, nbChunks) + totalOps := 0 + for o := range chOpsPerChunk { + for i, nbOps := range o { + opsPerChunk[i] += nbOps + totalOps += nbOps + } } - return toReturn, smallValues + chunkStats := make([]chunkStat, nbChunks) + target := float32(totalOps) / float32(nbChunks) + // what percentage are you of the target + for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + } + + return digits, chunkStats } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 91f7ede410..cd49deb3de 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -129,16 +129,9 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - _innerMsmG1(p, C, points, digits, splitFirstChunk) + _innerMsmG1(p, C, points, digits, chunkStats) return p, nil } @@ -200,7 +193,7 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { nbChunks := computeNbChunks(c) @@ -217,35 +210,29 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the 
rest, as it could be smaller. n := len(points) - processLastChunk := getChunkProcessorG1(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG1(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG1(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG1(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -371,16 +358,9 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - _innerMsmG2(p, C, points, digits, splitFirstChunk) + _innerMsmG2(p, C, points, digits, chunkStats) return p, nil } @@ -442,7 +422,7 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { nbChunks := computeNbChunks(c) @@ -459,35 +439,29 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. 
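The msmReduceChunk* call above then folds the per-chunk partial sums S_j into sum_j 2^(c*j) * S_j in the usual bucket-method fashion, Horner style: start from the most significant chunk and, for each step down, double c times and add the next partial sum. A toy sketch with integers standing in for curve points (doubling an int plays the role of doubling a point):

package main

import (
	"fmt"
	"math/big"
)

func main() {
	c := 4
	chunks := []int64{3, 0, 5} // partial sums S_0, S_1, S_2, least significant first
	acc := big.NewInt(chunks[len(chunks)-1])
	for j := len(chunks) - 2; j >= 0; j-- {
		for k := 0; k < c; k++ {
			acc.Add(acc, acc) // "double" c times between chunks
		}
		acc.Add(acc, big.NewInt(chunks[j])) // add the next partial sum
	}
	fmt.Println(acc) // 5*2^8 + 0*2^4 + 3 = 1283
}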
n := len(points) - processLastChunk := getChunkProcessorG2(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG2(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG2(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG2(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -543,24 +517,25 @@ func lastC(c uint64) uint64 { return n - (c * (n / c)) } +type chunkStat struct { + weight float32 // relative weight compared to other chunks. +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
// negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) // scalarsMont indicates wheter the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, []chunkStat) { // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) - toReturn := make([]uint16, len(scalars)*int(nbChunks)) + digits := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 - // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words + max := int(1 << (c - 1)) // max value we want for our digits + cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) @@ -579,36 +554,22 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK. 
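To see what the weight metric replacing the old smallValues heuristic does: partitionScalars now counts the non-zero digits per chunk, and weight expresses each chunk's count as a percentage of the per-chunk average, so the weight >= 150.0 test in _innerMsm* fires when a chunk carries at least 1.5x the average work. Toy numbers mimicking the SNARK-witness skew the removed comments mention, where small scalars populate only the first chunk:

package main

import "fmt"

func main() {
	opsPerChunk := []int{900, 40, 35, 25} // non-zero digits per chunk
	totalOps := 0
	for _, o := range opsPerChunk {
		totalOps += o
	}
	// same formula as this patch: weight = 100 * ops / (totalOps / nbChunks)
	target := float32(totalOps) / float32(len(opsPerChunk)) // 250
	for i, ops := range opsPerChunk {
		weight := float32(ops) * 100.0 / target
		fmt.Printf("chunk %d: weight=%.0f split=%v\n", i, weight, weight >= 150.0)
	}
	// chunk 0: weight=360 split=true; the other chunks stay on one goroutine
}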
- chSmallValues := make(chan int, nbTasks) + chOpsPerChunk := make(chan []int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 + opsPerChunk := make([]int, nbChunks) for i := start; i < end; i++ { - var carry int - scalar := scalars[i] if scalarsMont { scalar.FromMont() } - if scalar.FitsOnOneWord() { + if scalar.IsZero() { // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } + continue } + var carry int + // for each chunk in the scalar, compute the current digit, and an eventual carry for chunk := uint64(0); chunk < nbChunks; chunk++ { s := selectors[chunk] @@ -637,26 +598,39 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // if digit is zero, no impact on result if digit == 0 { continue - } else if digit > 0 { + } + if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } - toReturn[int(chunk)*len(scalars)+i] = bits + digits[int(chunk)*len(scalars)+i] = bits + opsPerChunk[chunk]++ } } - chSmallValues <- smallValues + chOpsPerChunk <- opsPerChunk }, nbTasks) - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o + // aggregate chunk stats + close(chOpsPerChunk) + opsPerChunk := make([]int, nbChunks) + totalOps := 0 + for o := range chOpsPerChunk { + for i, nbOps := range o { + opsPerChunk[i] += nbOps + totalOps += nbOps + } } - return toReturn, smallValues + chunkStats := make([]chunkStat, nbChunks) + target := float32(totalOps) / float32(nbChunks) + // what percentage are you of the target + for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + } + + return digits, chunkStats } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index 191875391e..4d7a2e07f5 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -129,16 +129,9 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - _innerMsmG1(p, C, points, digits, splitFirstChunk) + _innerMsmG1(p, C, points, digits, chunkStats) return p, nil } @@ -200,7 +193,7 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { nbChunks := computeNbChunks(c) @@ -217,35 +210,29 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the 
rest, as it could be smaller. n := len(points) - processLastChunk := getChunkProcessorG1(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG1(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG1(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG1(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -371,16 +358,9 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - _innerMsmG2(p, C, points, digits, splitFirstChunk) + _innerMsmG2(p, C, points, digits, chunkStats) return p, nil } @@ -442,7 +422,7 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { nbChunks := computeNbChunks(c) @@ -459,35 +439,29 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. 
n := len(points) - processLastChunk := getChunkProcessorG2(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG2(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG2(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG2(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -543,24 +517,25 @@ func lastC(c uint64) uint64 { return n - (c * (n / c)) } +type chunkStat struct { + weight float32 // relative weight compared to other chunks. +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
// negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) // scalarsMont indicates wheter the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, []chunkStat) { // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) - toReturn := make([]uint16, len(scalars)*int(nbChunks)) + digits := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 - // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words + max := int(1 << (c - 1)) // max value we want for our digits + cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) @@ -579,36 +554,22 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK. 
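The hunk that follows replaces the small-value counting with a full signed-digit decomposition. As a single-word sketch of the arithmetic (the real code below works on multi-limb fr.Element values via the precomputed selectors; decomposeSigned is an illustrative name, not the library's API):

// decomposeSigned splits a scalar into signed c-bit digits, least
// significant first. A window larger than 2^(c-1) borrows 2^c from the
// next window, so every digit ends up in [-(2^(c-1)-1), 2^(c-1)].
// A zero scalar yields no digits, matching the IsZero early exit below.
func decomposeSigned(scalar uint64, c uint) []int64 {
	mask := uint64(1<<c) - 1
	max := int64(1 << (c - 1))
	var digits []int64
	carry := int64(0)
	for scalar != 0 || carry != 0 {
		d := int64(scalar&mask) + carry
		scalar >>= c
		carry = 0
		if d > max {
			d -= 1 << c // borrow 2^c from the next window
			carry = 1
		}
		digits = append(digits, d)
	}
	return digits
}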
- chSmallValues := make(chan int, nbTasks) + chOpsPerChunk := make(chan []int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 + opsPerChunk := make([]int, nbChunks) for i := start; i < end; i++ { - var carry int - scalar := scalars[i] if scalarsMont { scalar.FromMont() } - if scalar.FitsOnOneWord() { + if scalar.IsZero() { // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } + continue } + var carry int + // for each chunk in the scalar, compute the current digit, and an eventual carry for chunk := uint64(0); chunk < nbChunks; chunk++ { s := selectors[chunk] @@ -637,26 +598,39 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // if digit is zero, no impact on result if digit == 0 { continue - } else if digit > 0 { + } + if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } - toReturn[int(chunk)*len(scalars)+i] = bits + digits[int(chunk)*len(scalars)+i] = bits + opsPerChunk[chunk]++ } } - chSmallValues <- smallValues + chOpsPerChunk <- opsPerChunk }, nbTasks) - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o + // aggregate chunk stats + close(chOpsPerChunk) + opsPerChunk := make([]int, nbChunks) + totalOps := 0 + for o := range chOpsPerChunk { + for i, nbOps := range o { + opsPerChunk[i] += nbOps + totalOps += nbOps + } } - return toReturn, smallValues + chunkStats := make([]chunkStat, nbChunks) + target := float32(totalOps) / float32(nbChunks) + // what percentage are you of the target + for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + } + + return digits, chunkStats } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 894b89a73d..3f49fd1afd 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -129,16 +129,9 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - _innerMsmG1(p, C, points, digits, splitFirstChunk) + _innerMsmG1(p, C, points, digits, chunkStats) return p, nil } @@ -200,7 +193,7 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { nbChunks := computeNbChunks(c) @@ -217,35 +210,29 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the 
rest, as it could be smaller. n := len(points) - processLastChunk := getChunkProcessorG1(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG1(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG1(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG1(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -371,16 +358,9 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - _innerMsmG2(p, C, points, digits, splitFirstChunk) + _innerMsmG2(p, C, points, digits, chunkStats) return p, nil } @@ -442,7 +422,7 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { nbChunks := computeNbChunks(c) @@ -459,35 +439,29 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. 
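The split branch below follows a standard fan-out/fan-in shape: two producers each handle half the points and a merger forwards a single combined value, so the consumer still receives exactly one result per chunk. A generic sketch of just that pattern, assuming Go 1.18 generics for brevity (runSplit and its parameters are illustrative, not part of gnark-crypto; the real code uses g2JacExtended and processChunk directly):

// Two producers each send one partial result; the merger combines them
// and forwards a single value on the chunk's output channel.
func runSplit[T any](out chan<- T, merge func(T, T) T, left, right func(chan<- T)) {
	chSplit := make(chan T, 2)
	go left(chSplit)
	go right(chSplit)
	go func() {
		s1 := <-chSplit
		s2 := <-chSplit
		close(chSplit)
		out <- merge(s1, s2)
	}()
}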
n := len(points) - processLastChunk := getChunkProcessorG2(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG2(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG2(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG2(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -543,24 +517,25 @@ func lastC(c uint64) uint64 { return n - (c * (n / c)) } +type chunkStat struct { + weight float32 // relative weight compared to other chunks. +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
// negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) // scalarsMont indicates wheter the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, []chunkStat) { // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) - toReturn := make([]uint16, len(scalars)*int(nbChunks)) + digits := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 - // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words + max := int(1 << (c - 1)) // max value we want for our digits + cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) @@ -579,36 +554,22 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK. 
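The selectors built above record, per chunk, which limb(s), mask, and shifts isolate that chunk's c bits; when c does not divide 64 a window can straddle two 64-bit words. A hypothetical flattened version of that lookup, assuming c <= 64 and little-endian limbs (extractWindow is a sketch of the bit arithmetic, not the library's API):

// extractWindow reads c bits starting at bit offset chunk*c, pulling
// the high part from the next limb when the window straddles a word
// boundary.
func extractWindow(limbs []uint64, chunk, c uint64) uint64 {
	bit := chunk * c
	word, shift := bit/64, bit%64
	mask := uint64(1<<c) - 1
	w := limbs[word] >> shift
	if shift+c > 64 && word+1 < uint64(len(limbs)) {
		w |= limbs[word+1] << (64 - shift) // bits spilling into the next word
	}
	return w & mask
}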
- chSmallValues := make(chan int, nbTasks) + chOpsPerChunk := make(chan []int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 + opsPerChunk := make([]int, nbChunks) for i := start; i < end; i++ { - var carry int - scalar := scalars[i] if scalarsMont { scalar.FromMont() } - if scalar.FitsOnOneWord() { + if scalar.IsZero() { // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } + continue } + var carry int + // for each chunk in the scalar, compute the current digit, and an eventual carry for chunk := uint64(0); chunk < nbChunks; chunk++ { s := selectors[chunk] @@ -637,26 +598,39 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // if digit is zero, no impact on result if digit == 0 { continue - } else if digit > 0 { + } + if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } - toReturn[int(chunk)*len(scalars)+i] = bits + digits[int(chunk)*len(scalars)+i] = bits + opsPerChunk[chunk]++ } } - chSmallValues <- smallValues + chOpsPerChunk <- opsPerChunk }, nbTasks) - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o + // aggregate chunk stats + close(chOpsPerChunk) + opsPerChunk := make([]int, nbChunks) + totalOps := 0 + for o := range chOpsPerChunk { + for i, nbOps := range o { + opsPerChunk[i] += nbOps + totalOps += nbOps + } } - return toReturn, smallValues + chunkStats := make([]chunkStat, nbChunks) + target := float32(totalOps) / float32(nbChunks) + // what percentage are you of the target + for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + } + + return digits, chunkStats } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index 7cd831c238..29548cc89c 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -129,16 +129,9 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - _innerMsmG1(p, C, points, digits, splitFirstChunk) + _innerMsmG1(p, C, points, digits, chunkStats) return p, nil } @@ -200,7 +193,7 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { nbChunks := computeNbChunks(c) @@ -217,35 +210,29 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the 
rest, as it could be smaller. n := len(points) - processLastChunk := getChunkProcessorG1(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG1(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG1(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG1(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -371,16 +358,9 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - _innerMsmG2(p, C, points, digits, splitFirstChunk) + _innerMsmG2(p, C, points, digits, chunkStats) return p, nil } @@ -442,7 +422,7 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { nbChunks := computeNbChunks(c) @@ -459,35 +439,29 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. 
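Each processChunk goroutine launched below consumes the encoded digits by bucketing points. To show only the effect of the encoding, here is a toy model over the integers, where the group law becomes ordinary addition (the real code adds curve points into g2JacExtended buckets, and gnark-crypto's exact bucket indexing may differ from this sketch):

// With digits in [-(2^(c-1)-1), 2^(c-1)], only 2^(c-1) buckets are
// needed: the low bit of the encoding carries the sign, so a negative
// digit subtracts the point instead of adding it.
func accumulate(buckets []int64, points []int64, digits []uint16) {
	for i, bits := range digits {
		if bits == 0 {
			continue // digit 0 was never written: no contribution
		}
		if bits&1 == 0 {
			buckets[(bits>>1)-1] += points[i] // digit = bits>>1
		} else {
			buckets[bits>>1] -= points[i] // digit = -(bits>>1) - 1
		}
	}
	// the chunk's partial sum is then the sum over j of (j+1)*buckets[j]
}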
n := len(points) - processLastChunk := getChunkProcessorG2(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG2(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG2(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG2(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -543,24 +517,25 @@ func lastC(c uint64) uint64 { return n - (c * (n / c)) } +type chunkStat struct { + weight float32 // relative weight compared to other chunks. +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
// negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) // scalarsMont indicates wheter the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, []chunkStat) { // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) - toReturn := make([]uint16, len(scalars)*int(nbChunks)) + digits := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 - // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words + max := int(1 << (c - 1)) // max value we want for our digits + cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) @@ -579,36 +554,22 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK. 
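To make the decomposition below concrete: with c = 4, the scalar 181 = 0b10110101 splits into windows 5 and 11, least significant first. Since 11 > 2^(c-1) = 8, it is replaced by 11 - 16 = -5 with a carry of 1 into a third window, and indeed 5 - 5*16 + 1*256 = 181. The stored encodings are then 5<<1 = 10, (4<<1)+1 = 9 (for -5, since -(-5)-1 = 4), and 1<<1 = 2.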
- chSmallValues := make(chan int, nbTasks) + chOpsPerChunk := make(chan []int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 + opsPerChunk := make([]int, nbChunks) for i := start; i < end; i++ { - var carry int - scalar := scalars[i] if scalarsMont { scalar.FromMont() } - if scalar.FitsOnOneWord() { + if scalar.IsZero() { // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } + continue } + var carry int + // for each chunk in the scalar, compute the current digit, and an eventual carry for chunk := uint64(0); chunk < nbChunks; chunk++ { s := selectors[chunk] @@ -637,26 +598,39 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // if digit is zero, no impact on result if digit == 0 { continue - } else if digit > 0 { + } + if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } - toReturn[int(chunk)*len(scalars)+i] = bits + digits[int(chunk)*len(scalars)+i] = bits + opsPerChunk[chunk]++ } } - chSmallValues <- smallValues + chOpsPerChunk <- opsPerChunk }, nbTasks) - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o + // aggregate chunk stats + close(chOpsPerChunk) + opsPerChunk := make([]int, nbChunks) + totalOps := 0 + for o := range chOpsPerChunk { + for i, nbOps := range o { + opsPerChunk[i] += nbOps + totalOps += nbOps + } } - return toReturn, smallValues + chunkStats := make([]chunkStat, nbChunks) + target := float32(totalOps) / float32(nbChunks) + // what percentage are you of the target + for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + } + + return digits, chunkStats } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index 9b3140de09..87cee65e19 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -129,16 +129,9 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - _innerMsmG1(p, C, points, digits, splitFirstChunk) + _innerMsmG1(p, C, points, digits, chunkStats) return p, nil } @@ -200,7 +193,7 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { nbChunks := computeNbChunks(c) @@ -217,35 +210,29 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could 
be smaller. n := len(points) - processLastChunk := getChunkProcessorG1(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG1(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG1(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG1(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -371,16 +358,9 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - _innerMsmG2(p, C, points, digits, splitFirstChunk) + _innerMsmG2(p, C, points, digits, chunkStats) return p, nil } @@ -442,7 +422,7 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { nbChunks := computeNbChunks(c) @@ -459,35 +439,29 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. 
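After the dispatch loop below, msmReduceChunkG2Affine folds the per-chunk partial sums into the final point. Its implementation sits outside this hunk; as a toy model over the integers, the fold is Horner evaluation in base 2^c, where each left shift stands in for c point doublings:

// Chunk j is weighted by 2^(c*j), so folding from the top chunk down
// needs only c doublings per step. (Integer sketch; the real reduce
// consumes the chunk channels in order and doubles curve points.)
func reduce(c uint, chunkSums []int64) int64 {
	var acc int64
	for j := len(chunkSums) - 1; j >= 0; j-- {
		acc <<= c // c point doublings in the real code
		acc += chunkSums[j]
	}
	return acc
}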
n := len(points) - processLastChunk := getChunkProcessorG2(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG2(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG2(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG2(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -543,24 +517,25 @@ func lastC(c uint64) uint64 { return n - (c * (n / c)) } +type chunkStat struct { + weight float32 // relative weight compared to other chunks. +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
// negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) // scalarsMont indicates wheter the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, []chunkStat) { // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) - toReturn := make([]uint16, len(scalars)*int(nbChunks)) + digits := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 - // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words + max := int(1 << (c - 1)) // max value we want for our digits + cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) @@ -579,36 +554,22 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK. 
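The encoding written below packs a signed digit into a uint16: the low bit is the sign, the upper bits the sign-adjusted magnitude, and 0 is reserved for a zero digit. A hypothetical inverse, to make the round trip explicit (decodeDigit is illustrative; the chunk processors consume the encoding directly):

func decodeDigit(bits uint16) int {
	if bits == 0 {
		return 0
	}
	if bits&1 == 0 {
		return int(bits >> 1) // positive digit: bits = digit<<1
	}
	return -int(bits>>1) - 1 // negative digit: bits = ((-digit-1)<<1)+1
}

Round trip: a positive digit d gives bits = d<<1, decoded back to d; a negative d gives bits = ((-d-1)<<1)+1, decoded to -(bits>>1)-1 = d.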
- chSmallValues := make(chan int, nbTasks) + chOpsPerChunk := make(chan []int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 + opsPerChunk := make([]int, nbChunks) for i := start; i < end; i++ { - var carry int - scalar := scalars[i] if scalarsMont { scalar.FromMont() } - if scalar.FitsOnOneWord() { + if scalar.IsZero() { // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } + continue } + var carry int + // for each chunk in the scalar, compute the current digit, and an eventual carry for chunk := uint64(0); chunk < nbChunks; chunk++ { s := selectors[chunk] @@ -637,26 +598,39 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // if digit is zero, no impact on result if digit == 0 { continue - } else if digit > 0 { + } + if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } - toReturn[int(chunk)*len(scalars)+i] = bits + digits[int(chunk)*len(scalars)+i] = bits + opsPerChunk[chunk]++ } } - chSmallValues <- smallValues + chOpsPerChunk <- opsPerChunk }, nbTasks) - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o + // aggregate chunk stats + close(chOpsPerChunk) + opsPerChunk := make([]int, nbChunks) + totalOps := 0 + for o := range chOpsPerChunk { + for i, nbOps := range o { + opsPerChunk[i] += nbOps + totalOps += nbOps + } } - return toReturn, smallValues + chunkStats := make([]chunkStat, nbChunks) + target := float32(totalOps) / float32(nbChunks) + // what percentage are you of the target + for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + } + + return digits, chunkStats } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index 71170c6cf9..d77d85346f 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -129,16 +129,9 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - _innerMsmG1(p, C, points, digits, splitFirstChunk) + _innerMsmG1(p, C, points, digits, chunkStats) return p, nil } @@ -164,7 +157,7 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { nbChunks := computeNbChunks(c) @@ -181,35 +174,29 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it 
could be smaller. n := len(points) - processLastChunk := getChunkProcessorG1(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG1(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG1(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG1(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -335,16 +322,9 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - _innerMsmG2(p, C, points, digits, splitFirstChunk) + _innerMsmG2(p, C, points, digits, chunkStats) return p, nil } @@ -370,7 +350,7 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { nbChunks := computeNbChunks(c) @@ -387,35 +367,29 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. 
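One consequence of folding the last chunk into the main loop below: its processor is still built from lastC(c) rather than c, because when the total bit width n being decomposed is not a multiple of c, the top window only carries n - c*(n/c) meaningful bits, i.e. n mod c. Assuming, for illustration, n = 254 with c = 16, the first fifteen chunks take 16 bits each and the last takes 14; n's exact definition, and the handling when c divides n, sit outside this hunk.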
n := len(points) - processLastChunk := getChunkProcessorG2(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG2(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG2(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG2(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -471,24 +445,25 @@ func lastC(c uint64) uint64 { return n - (c * (n / c)) } +type chunkStat struct { + weight float32 // relative weight compared to other chunks. +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
// negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) // scalarsMont indicates wheter the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, []chunkStat) { // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) - toReturn := make([]uint16, len(scalars)*int(nbChunks)) + digits := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 - // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words + max := int(1 << (c - 1)) // max value we want for our digits + cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) @@ -507,36 +482,22 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK. 
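The aggregation introduced by the hunk below can be read standalone: each parallel.Execute worker sends one []int of per-chunk digit counts, and the receiver turns the totals into percentages of the average load. A self-contained sketch (aggregateWeights is an illustrative name; it assumes every worker has already sent, so the buffered channel can be closed and drained, and that at least one scalar is non-zero so target is non-zero):

func aggregateWeights(ch chan []int, nbChunks int) []float32 {
	close(ch) // all workers have sent; draining a closed buffered channel is safe
	ops := make([]int, nbChunks)
	total := 0
	for per := range ch {
		for i, n := range per {
			ops[i] += n
			total += n
		}
	}
	target := float32(total) / float32(nbChunks)
	weights := make([]float32, nbChunks)
	for i := range weights {
		weights[i] = float32(ops[i]) * 100.0 / target // 100 == average load
	}
	return weights
}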
- chSmallValues := make(chan int, nbTasks) + chOpsPerChunk := make(chan []int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 + opsPerChunk := make([]int, nbChunks) for i := start; i < end; i++ { - var carry int - scalar := scalars[i] if scalarsMont { scalar.FromMont() } - if scalar.FitsOnOneWord() { + if scalar.IsZero() { // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } + continue } + var carry int + // for each chunk in the scalar, compute the current digit, and an eventual carry for chunk := uint64(0); chunk < nbChunks; chunk++ { s := selectors[chunk] @@ -565,26 +526,39 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // if digit is zero, no impact on result if digit == 0 { continue - } else if digit > 0 { + } + if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } - toReturn[int(chunk)*len(scalars)+i] = bits + digits[int(chunk)*len(scalars)+i] = bits + opsPerChunk[chunk]++ } } - chSmallValues <- smallValues + chOpsPerChunk <- opsPerChunk }, nbTasks) - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o + // aggregate chunk stats + close(chOpsPerChunk) + opsPerChunk := make([]int, nbChunks) + totalOps := 0 + for o := range chOpsPerChunk { + for i, nbOps := range o { + opsPerChunk[i] += nbOps + totalOps += nbOps + } } - return toReturn, smallValues + chunkStats := make([]chunkStat, nbChunks) + target := float32(totalOps) / float32(nbChunks) + // what percentage are you of the target + for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + } + + return digits, chunkStats } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index 28672b9219..ae313e1d52 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -129,16 +129,9 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - _innerMsmG1(p, C, points, digits, splitFirstChunk) + _innerMsmG1(p, C, points, digits, chunkStats) return p, nil } @@ -164,7 +157,7 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { nbChunks := computeNbChunks(c) @@ -181,35 +174,29 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it 
could be smaller. n := len(points) - processLastChunk := getChunkProcessorG1(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG1(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG1(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG1(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -335,16 +322,9 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - _innerMsmG2(p, C, points, digits, splitFirstChunk) + _innerMsmG2(p, C, points, digits, chunkStats) return p, nil } @@ -370,7 +350,7 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { nbChunks := computeNbChunks(c) @@ -387,35 +367,29 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. 
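Taken together, the restructuring below means every chunk, including the last, flows through one uniform loop: each chunk owns a channel that receives exactly one g2JacExtended value, produced either by a single goroutine or, for chunks at or above 150% of the average weight, by two producers plus a merger. Compared with the old splitFirstChunk flag, which could only rebalance chunk 0, the per-chunk weights let any skewed window trigger a split, at the cost of the extra bucket allocations for the second producer.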
n := len(points) - processLastChunk := getChunkProcessorG2(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG2(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG2(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG2(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -471,24 +445,25 @@ func lastC(c uint64) uint64 { return n - (c * (n / c)) } +type chunkStat struct { + weight float32 // relative weight compared to other chunks. +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
// negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) // scalarsMont indicates whether the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, []chunkStat) { // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) - toReturn := make([]uint16, len(scalars)*int(nbChunks)) + digits := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 - // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words + max := int(1 << (c - 1)) // max value we want for our digits + cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) @@ -507,36 +482,22 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK.
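The parallel loop below performs this signed decomposition limb by limb. As a self-contained sketch of the same idea on a plain uint64 (signedDigits and encode are illustrative names; the real code selects each window across the 64-bit limbs of an fr.Element, possibly straddling two limbs):

package main

import "fmt"

// signedDigits decomposes scalar into c-bit signed digits in [-2^(c-1), 2^(c-1)]:
// any window strictly larger than 2^(c-1) borrows 2^c from the next window.
func signedDigits(scalar uint64, c uint) []int64 {
	mask := uint64(1)<<c - 1
	max := int64(1) << (c - 1)
	var digits []int64
	carry := int64(0)
	for scalar != 0 || carry != 0 {
		digit := int64(scalar&mask) + carry
		scalar >>= c
		carry = 0
		if digit > max {
			digit -= int64(1) << c // borrow 2^c from the next window
			carry = 1
		}
		digits = append(digits, digit)
	}
	return digits
}

// encode matches the packing in the diff: zero stays zero, positive digits
// are shifted left, and negative digits fold the sign into the low bit.
func encode(digit int64) uint16 {
	if digit == 0 {
		return 0
	}
	if digit > 0 {
		return uint16(digit) << 1
	}
	return (uint16(-digit-1) << 1) + 1
}

func main() {
	// 222 = 0xDE: the c=4 windows (LSW first) are 14, 13.
	// 14 > 8 borrows: digit -2, carry 1; 13+1 = 14 borrows again: digit -2, carry 1;
	// the final carry yields digit 1, so 222 = -2 - 2*16 + 1*256.
	for _, d := range signedDigits(222, 4) {
		fmt.Println(d, encode(d))
	}
}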
- chSmallValues := make(chan int, nbTasks) + chOpsPerChunk := make(chan []int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 + opsPerChunk := make([]int, nbChunks) for i := start; i < end; i++ { - var carry int - scalar := scalars[i] if scalarsMont { scalar.FromMont() } - if scalar.FitsOnOneWord() { + if scalar.IsZero() { // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } + continue } + var carry int + // for each chunk in the scalar, compute the current digit, and a possible carry for chunk := uint64(0); chunk < nbChunks; chunk++ { s := selectors[chunk] @@ -565,26 +526,39 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // if digit is zero, no impact on result if digit == 0 { continue - } else if digit > 0 { + } + if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } - toReturn[int(chunk)*len(scalars)+i] = bits + digits[int(chunk)*len(scalars)+i] = bits + opsPerChunk[chunk]++ } } - chSmallValues <- smallValues + chOpsPerChunk <- opsPerChunk }, nbTasks) - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o + // aggregate chunk stats + close(chOpsPerChunk) + opsPerChunk := make([]int, nbChunks) + totalOps := 0 + for o := range chOpsPerChunk { + for i, nbOps := range o { + opsPerChunk[i] += nbOps + totalOps += nbOps + } } - return toReturn, smallValues + chunkStats := make([]chunkStat, nbChunks) + target := float32(totalOps) / float32(nbChunks) + // express each chunk's op count as a percentage of the target + for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + } + + return digits, chunkStats } // partitionScalars computes, for each scalar over c-bit wide windows, nbChunk digits diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index cc02ad9b57..1c7ae6f672 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -129,16 +129,9 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - _innerMsmG1(p, C, points, digits, splitFirstChunk) + _innerMsmG1(p, C, points, digits, chunkStats) return p, nil } @@ -164,7 +157,7 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { nbChunks := computeNbChunks(c) @@ -181,35 +174,29 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it
could be smaller. n := len(points) - processLastChunk := getChunkProcessorG1(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG1(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG1(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG1(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -335,16 +322,9 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - _innerMsmG2(p, C, points, digits, splitFirstChunk) + _innerMsmG2(p, C, points, digits, chunkStats) return p, nil } @@ -370,7 +350,7 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { nbChunks := computeNbChunks(c) @@ -387,35 +367,29 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. 
n := len(points) - processLastChunk := getChunkProcessorG2(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG2(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG2(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG2(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -471,24 +445,25 @@ func lastC(c uint64) uint64 { return n - (c * (n / c)) } +type chunkStat struct { + weight float32 // relative weight compared to other chunks. +} + // partitionScalars computes, for each scalar over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract // 2^{c} from the current digit, making it negative.
// negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) // scalarsMont indicates whether the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, []chunkStat) { // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) - toReturn := make([]uint16, len(scalars)*int(nbChunks)) + digits := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 - // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words + max := int(1 << (c - 1)) // max value we want for our digits + cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) @@ -507,36 +482,22 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK.
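Alongside the digits, the loop below also counts how many non-zero digits (i.e. bucket operations) each chunk will process; the aggregation at its end converts those counts into the chunkStat weights consumed by _innerMsm. In isolation, the computation looks like this (a hedged sketch; chunkWeights is an illustrative name):

package main

import "fmt"

// chunkWeights mirrors the aggregation in the diff: each chunk's op count is
// expressed as a percentage of the ideal share totalOps/nbChunks, so 100
// means perfectly balanced and anything >= 150 triggers a split in _innerMsm.
func chunkWeights(opsPerChunk []int) []float32 {
	totalOps := 0
	for _, n := range opsPerChunk {
		totalOps += n
	}
	target := float32(totalOps) / float32(len(opsPerChunk))
	weights := make([]float32, len(opsPerChunk))
	for i, n := range opsPerChunk {
		weights[i] = float32(n) * 100.0 / target
	}
	return weights
}

func main() {
	// SNARK-style witnesses: many scalars are 0 or 1, so chunk 0 sees far
	// more non-zero digits than the higher windows.
	fmt.Println(chunkWeights([]int{900, 300, 300})) // [180 60 60]
}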
- chSmallValues := make(chan int, nbTasks) + chOpsPerChunk := make(chan []int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 + opsPerChunk := make([]int, nbChunks) for i := start; i < end; i++ { - var carry int - scalar := scalars[i] if scalarsMont { scalar.FromMont() } - if scalar.FitsOnOneWord() { + if scalar.IsZero() { // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } + continue } + var carry int + // for each chunk in the scalar, compute the current digit, and a possible carry for chunk := uint64(0); chunk < nbChunks; chunk++ { s := selectors[chunk] @@ -565,26 +526,39 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // if digit is zero, no impact on result if digit == 0 { continue - } else if digit > 0 { + } + if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } - toReturn[int(chunk)*len(scalars)+i] = bits + digits[int(chunk)*len(scalars)+i] = bits + opsPerChunk[chunk]++ } } - chSmallValues <- smallValues + chOpsPerChunk <- opsPerChunk }, nbTasks) - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o + // aggregate chunk stats + close(chOpsPerChunk) + opsPerChunk := make([]int, nbChunks) + totalOps := 0 + for o := range chOpsPerChunk { + for i, nbOps := range o { + opsPerChunk[i] += nbOps + totalOps += nbOps + } } - return toReturn, smallValues + chunkStats := make([]chunkStat, nbChunks) + target := float32(totalOps) / float32(nbChunks) + // express each chunk's op count as a percentage of the target + for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + } + + return digits, chunkStats } // partitionScalars computes, for each scalar over c-bit wide windows, nbChunk digits diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index ef2b67b636..4aa916fcfc 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -54,6 +54,10 @@ func lastC(c uint64) uint64 { return n - (c * (n / c)) } +type chunkStat struct { + weight float32 // relative weight compared to other chunks.
+} + // partitionScalars computes, for each scalar over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract @@ -61,16 +65,13 @@ func lastC(c uint64) uint64 { // negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) // scalarsMont indicates whether the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, []chunkStat) { // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) - toReturn := make([]uint16, len(scalars)*int(nbChunks)) + digits := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 - // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window max := int(1 << (c -1)) // max value we want for our digits cDivides64 := (64 %c ) == 0 // if c doesn't divide 64, we may need to select over multiple words @@ -92,36 +93,22 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK.
- chSmallValues := make(chan int, nbTasks) + chOpsPerChunk := make(chan []int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 + opsPerChunk := make([]int, nbChunks) for i:=start; i < end; i++ { - var carry int - scalar := scalars[i] if scalarsMont { scalar.FromMont() } - if scalar.FitsOnOneWord() { + if scalar.IsZero() { // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } + continue } + var carry int + // for each chunk in the scalar, compute the current digit, and a possible carry for chunk := uint64(0); chunk < nbChunks; chunk++ { s := selectors[chunk] @@ -151,27 +138,40 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // if digit is zero, no impact on result if digit == 0 { continue - } else if digit > 0 { + } + if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } - toReturn[int(chunk)*len(scalars)+i] = bits + digits[int(chunk)*len(scalars)+i] = bits + opsPerChunk[chunk]++ } } - chSmallValues <- smallValues + chOpsPerChunk <- opsPerChunk }, nbTasks) - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues+=o + // aggregate chunk stats + close(chOpsPerChunk) + opsPerChunk := make([]int, nbChunks) + totalOps := 0 + for o := range chOpsPerChunk { + for i, nbOps := range o { + opsPerChunk[i]+=nbOps + totalOps += nbOps + } } - return toReturn, smallValues + chunkStats := make([]chunkStat, nbChunks) + target := float32(totalOps) / float32(nbChunks) + // express each chunk's op count as a percentage of the target + for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + } + + return digits, chunkStats } @@ -412,16 +412,9 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - _innerMsm{{ $.UPointName }}(p, C, points, digits, splitFirstChunk) + _innerMsm{{ $.UPointName }}(p, C, points, digits, chunkStats) return p, nil } @@ -447,7 +440,7 @@ func getChunkProcessor{{ $.UPointName }}(c uint64 /* some other params to determ } } -func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, digits []uint16, splitFirstChunk bool) *{{ $.TJacobian }} { +func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, digits []uint16, chunkStats []chunkStat) *{{ $.TJacobian }} { nbChunks := computeNbChunks(c) @@ -464,37 +457,31 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T // the last chunk may be processed with a different method than the rest, as it could be smaller.
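A few hunks below, the next commit (PATCH 25/43) adds a processQueue() call after each executeAndReset() in the batch-affine chunk processors. The batch-affine scheduling itself lives in multiexp_affine.go, which this excerpt only shows in part, so the model below is a heavily hedged toy (integer "points", map-based bookkeeping, invented names op/runChunk): adds that conflict with a bucket already scheduled in the current batch are parked in a queue, and the new call drains that queue each time the batch is flushed instead of letting it grow until the end of the chunk.

package main

import "fmt"

type op struct {
	bucketID int
	point    int // stand-in for a curve point
}

// runChunk sketches the scheduling around the added processQueue() call.
func runChunk(ops []op, nbBuckets, batchSize int) []int {
	buckets := make([]int, nbBuckets)
	inBatch := make(map[int]bool) // buckets touched by the current batch
	batch := make([]op, 0, batchSize)
	var queue []op

	executeAndReset := func() {
		for _, o := range batch { // stands in for one batched affine addition
			buckets[o.bucketID] += o.point
		}
		batch = batch[:0]
		inBatch = make(map[int]bool)
	}
	add := func(o op) bool {
		if inBatch[o.bucketID] {
			return false // conflict: a bucket may appear only once per batch
		}
		inBatch[o.bucketID] = true
		batch = append(batch, o)
		return true
	}
	processQueue := func() {
		for i := 0; i < len(queue); {
			if add(queue[i]) {
				queue = append(queue[:i], queue[i+1:]...)
			} else {
				i++
			}
		}
	}

	for _, o := range ops {
		if !add(o) {
			queue = append(queue, o)
		}
		if len(batch) == batchSize {
			executeAndReset()
			processQueue() // the call added by this commit
		}
	}
	// drain the leftovers: after each flush the batch is empty, so at least
	// one queued op is admitted per round and the loop terminates.
	for executeAndReset(); len(queue) > 0; executeAndReset() {
		processQueue()
	}
	return buckets
}

func main() {
	ops := []op{{0, 1}, {0, 1}, {1, 1}, {0, 1}, {2, 1}}
	fmt.Println(runChunk(ops, 3, 2)) // [3 1 1]
}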
n := len(points) - processLastChunk := getChunkProcessor{{ $.UPointName }}(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - - for j := int(nbChunks - 2); j >0; j-- { - processChunk := getChunkProcessor{{ $.UPointName }}(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessor{{ $.UPointName }}(c) - if !splitFirstChunk { - go processChunk(0,chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks - 1) { + processChunk = getChunkProcessor{{ $.UPointName }}(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan {{ $.TJacobianExtended }}, 2) split := n / 2 - go processChunk(0,chSplit, c, points[:split], digits[:split]) - go processChunk(0,chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j),chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j),chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() - } - + continue + } + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } - + return msmReduceChunk{{ $.TAffine }}(p, int(c), chChunks[:]) } From 989a932fbe139eabb32b4264f98bbcb325d23dbb Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 15 Nov 2022 11:57:01 -0600 Subject: [PATCH 25/43] test: added msm benchmarks with small values and redundancy --- ecc/bls12-377/multiexp_affine.go | 2 + ecc/bls12-377/multiexp_test.go | 76 ++++++++++++++++++- ecc/bls12-378/multiexp_affine.go | 2 + ecc/bls12-378/multiexp_test.go | 76 ++++++++++++++++++- ecc/bls12-381/multiexp_affine.go | 2 + ecc/bls12-381/multiexp_test.go | 76 ++++++++++++++++++- ecc/bls24-315/multiexp_affine.go | 2 + ecc/bls24-315/multiexp_test.go | 76 ++++++++++++++++++- ecc/bls24-317/multiexp_affine.go | 2 + ecc/bls24-317/multiexp_test.go | 76 ++++++++++++++++++- ecc/bn254/multiexp_affine.go | 2 + ecc/bn254/multiexp_test.go | 76 ++++++++++++++++++- ecc/bw6-633/multiexp_affine.go | 2 + ecc/bw6-633/multiexp_test.go | 76 ++++++++++++++++++- ecc/bw6-756/multiexp_affine.go | 2 + ecc/bw6-756/multiexp_test.go | 76 ++++++++++++++++++- ecc/bw6-761/multiexp_affine.go | 2 + ecc/bw6-761/multiexp_test.go | 76 ++++++++++++++++++- .../ecc/template/multiexp_affine.go.tmpl | 1 + .../ecc/template/tests/multiexp.go.tmpl | 34 +++++++++ 20 files changed, 701 insertions(+), 36 deletions(-) diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index 8f33a80438..65e531185b 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -164,6 +164,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } @@ -443,6 +444,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af 
add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 347fb0ab6b..3c8cf6ff05 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -222,11 +222,31 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine @@ -240,6 +260,20 @@ func BenchmarkMultiExpG1(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -503,11 +537,31 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine @@ -521,6 +575,20 @@ func BenchmarkMultiExpG2(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index 09ee239d09..f48f316a4a 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ 
-164,6 +164,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } @@ -443,6 +444,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index 90b1321a9a..d693b2c8d7 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -222,11 +222,31 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine @@ -240,6 +260,20 @@ func BenchmarkMultiExpG1(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -503,11 +537,31 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine @@ -521,6 +575,20 @@ func BenchmarkMultiExpG2(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + 
testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index f452ac210e..2e3776394d 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -164,6 +164,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } @@ -443,6 +444,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index 5b0b8eb7cc..c22d9bc508 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -222,11 +222,31 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine @@ -240,6 +260,20 @@ func BenchmarkMultiExpG1(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -503,11 +537,31 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine @@ -521,6 +575,20 @@ func BenchmarkMultiExpG2(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b 
*testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index fa683c78a1..d253497f17 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -164,6 +164,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } @@ -443,6 +444,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index 9bda4cefd3..6c169ffc9c 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -222,11 +222,31 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine @@ -240,6 +260,20 @@ func BenchmarkMultiExpG1(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -503,11 +537,31 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] 
+ } + } + fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine @@ -521,6 +575,20 @@ func BenchmarkMultiExpG2(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index 913b2e9308..d6a509fc82 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -164,6 +164,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } @@ -443,6 +444,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index c166598a34..5b36edf2ee 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -222,11 +222,31 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine @@ -240,6 +260,20 @@ func BenchmarkMultiExpG1(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -503,11 +537,31 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + 
sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine @@ -521,6 +575,20 @@ func BenchmarkMultiExpG2(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index 8f6e9073b8..939a1b71f2 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -164,6 +164,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } @@ -443,6 +444,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index 5962e0b859..67cfc85953 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -222,11 +222,31 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine @@ -240,6 +260,20 @@ func BenchmarkMultiExpG1(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -503,11 +537,31 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant 
[nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine @@ -521,6 +575,20 @@ func BenchmarkMultiExpG2(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index 870483e934..f3c51b51bf 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -163,6 +163,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } @@ -370,6 +371,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 45bef3125a..a65f05c5c2 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -222,11 +222,31 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine @@ -240,6 +260,20 @@ func BenchmarkMultiExpG1(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -503,11 +537,31 @@ 
func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine @@ -521,6 +575,20 @@ func BenchmarkMultiExpG2(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index ca6e7c172a..0925a40c35 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -163,6 +163,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } @@ -370,6 +371,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index 57956e9c5a..5b81aea368 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -222,11 +222,31 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine @@ -240,6 +260,20 @@ func BenchmarkMultiExpG1(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], 
ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -503,11 +537,31 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine @@ -521,6 +575,20 @@ func BenchmarkMultiExpG2(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index 3b653b6563..6b4ec532ea 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -163,6 +163,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } @@ -370,6 +371,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index 968613803a..38283453c4 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -222,11 +222,31 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine @@ -240,6 +260,20 @@ func BenchmarkMultiExpG1(b *testing.B) { 
testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -503,11 +537,31 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine @@ -521,6 +575,20 @@ func BenchmarkMultiExpG2(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index 8462321866..513b06f96f 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -173,6 +173,7 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet, T add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index 455142ceb6..5e0298d575 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -237,9 +237,29 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { var ( samplePoints [nbSamples]{{ $.TAffine }} sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:],sampleScalars[:]) + copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // should split the scalars + for i:=0; i < len(sampleScalarsSmallValues);i++ { + if i % 5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i:=0; i < len(sampleScalarsRedundant);i+=10 { + for j:=i+1; j < i+10 && j < 
len(sampleScalarsRedundant);j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBases{{ toUpper $.PointName }}(samplePoints[:]) @@ -254,6 +274,20 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using],ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + } + }) } } From 52e5eaa8405c9f3ae5ae60c0933b1899d5c67ab8 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 15 Nov 2022 12:28:04 -0600 Subject: [PATCH 26/43] test: update worst case benchmark for batch affine msm --- ecc/bls12-377/multiexp_test.go | 12 ++++++------ ecc/bls12-378/multiexp_test.go | 12 ++++++------ ecc/bls12-381/multiexp_test.go | 12 ++++++------ ecc/bls24-315/multiexp_test.go | 12 ++++++------ ecc/bls24-317/multiexp_test.go | 12 ++++++------ ecc/bn254/multiexp_test.go | 12 ++++++------ ecc/bw6-633/multiexp_test.go | 12 ++++++------ ecc/bw6-756/multiexp_test.go | 12 ++++++------ ecc/bw6-761/multiexp_test.go | 12 ++++++------ .../generator/ecc/template/tests/multiexp.go.tmpl | 6 +++--- 10 files changed, 57 insertions(+), 57 deletions(-) diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 3c8cf6ff05..6425235853 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -232,7 +232,7 @@ func BenchmarkMultiExpG1(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -241,8 +241,8 @@ func BenchmarkMultiExpG1(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } @@ -547,7 +547,7 @@ func BenchmarkMultiExpG2(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -556,8 +556,8 @@ func BenchmarkMultiExpG2(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index d693b2c8d7..1b586aba7e 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -232,7 +232,7 
@@ func BenchmarkMultiExpG1(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -241,8 +241,8 @@ func BenchmarkMultiExpG1(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } @@ -547,7 +547,7 @@ func BenchmarkMultiExpG2(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -556,8 +556,8 @@ func BenchmarkMultiExpG2(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index c22d9bc508..2b3ac662fc 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -232,7 +232,7 @@ func BenchmarkMultiExpG1(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -241,8 +241,8 @@ func BenchmarkMultiExpG1(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } @@ -547,7 +547,7 @@ func BenchmarkMultiExpG2(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -556,8 +556,8 @@ func BenchmarkMultiExpG2(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index 6c169ffc9c..e0b3958b87 100644 --- a/ecc/bls24-315/multiexp_test.go 
+++ b/ecc/bls24-315/multiexp_test.go @@ -232,7 +232,7 @@ func BenchmarkMultiExpG1(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -241,8 +241,8 @@ func BenchmarkMultiExpG1(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } @@ -547,7 +547,7 @@ func BenchmarkMultiExpG2(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -556,8 +556,8 @@ func BenchmarkMultiExpG2(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index 5b36edf2ee..7bc0eacb61 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -232,7 +232,7 @@ func BenchmarkMultiExpG1(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -241,8 +241,8 @@ func BenchmarkMultiExpG1(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } @@ -547,7 +547,7 @@ func BenchmarkMultiExpG2(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -556,8 +556,8 @@ func BenchmarkMultiExpG2(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index 67cfc85953..d5840336a4 
100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -232,7 +232,7 @@ func BenchmarkMultiExpG1(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -241,8 +241,8 @@ func BenchmarkMultiExpG1(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } @@ -547,7 +547,7 @@ func BenchmarkMultiExpG2(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -556,8 +556,8 @@ func BenchmarkMultiExpG2(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index a65f05c5c2..244ae19386 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -232,7 +232,7 @@ func BenchmarkMultiExpG1(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -241,8 +241,8 @@ func BenchmarkMultiExpG1(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } @@ -547,7 +547,7 @@ func BenchmarkMultiExpG2(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -556,8 +556,8 @@ func BenchmarkMultiExpG2(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } diff --git a/ecc/bw6-756/multiexp_test.go 
b/ecc/bw6-756/multiexp_test.go index 5b81aea368..2a1f0cda97 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -232,7 +232,7 @@ func BenchmarkMultiExpG1(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -241,8 +241,8 @@ func BenchmarkMultiExpG1(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } @@ -547,7 +547,7 @@ func BenchmarkMultiExpG2(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -556,8 +556,8 @@ func BenchmarkMultiExpG2(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index 38283453c4..589464949f 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -232,7 +232,7 @@ func BenchmarkMultiExpG1(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -241,8 +241,8 @@ func BenchmarkMultiExpG1(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } @@ -547,7 +547,7 @@ func BenchmarkMultiExpG2(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -556,8 +556,8 @@ func BenchmarkMultiExpG2(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } diff 
--git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index 5e0298d575..9b259ebca4 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -245,7 +245,7 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { copy(sampleScalarsSmallValues[:],sampleScalars[:]) copy(sampleScalarsRedundant[:],sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i:=0; i < len(sampleScalarsSmallValues);i++ { if i % 5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -254,8 +254,8 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { } // bad case for batch affine - for i:=0; i < len(sampleScalarsRedundant);i+=10 { - for j:=i+1; j < i+10 && j < len(sampleScalarsRedundant);j++ { + for i:=0; i < len(sampleScalarsRedundant);i+=100 { + for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } From 52191c989a294085bcba460c35a2f33b82bd52da Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 15 Nov 2022 14:30:35 -0600 Subject: [PATCH 27/43] feat: start to add statistics when parsing scalars in msm --- ecc/bls12-377/multiexp.go | 170 ++++++++++++++---- ecc/bls12-377/multiexp_test.go | 8 +- ecc/bls12-378/multiexp.go | 170 ++++++++++++++---- ecc/bls12-378/multiexp_test.go | 8 +- ecc/bls12-381/multiexp.go | 170 ++++++++++++++---- ecc/bls12-381/multiexp_test.go | 8 +- ecc/bls24-315/multiexp.go | 170 ++++++++++++++---- ecc/bls24-315/multiexp_test.go | 8 +- ecc/bls24-317/multiexp.go | 170 ++++++++++++++---- ecc/bls24-317/multiexp_test.go | 8 +- ecc/bn254/multiexp.go | 170 ++++++++++++++---- ecc/bn254/multiexp_test.go | 8 +- ecc/bw6-633/multiexp.go | 122 ++++++++++--- ecc/bw6-633/multiexp_test.go | 8 +- ecc/bw6-756/multiexp.go | 122 ++++++++++--- ecc/bw6-756/multiexp_test.go | 8 +- ecc/bw6-761/multiexp.go | 122 ++++++++++--- ecc/bw6-761/multiexp_test.go | 8 +- .../generator/ecc/template/multiexp.go.tmpl | 106 +++++++++-- .../ecc/template/tests/multiexp.go.tmpl | 4 +- 20 files changed, 1216 insertions(+), 352 deletions(-) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index b04f523cff..fdd6b005bf 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -136,8 +136,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { case 4: @@ -153,37 +152,51 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu case 9: return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - if mustBeExt { + const batchSize = 80 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC10] } return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - if mustBeExt { + const batchSize = 150 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 
4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC11] } return processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - if mustBeExt { + const batchSize = 200 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC12] } return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - if mustBeExt { + const batchSize = 350 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC13] } return processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - if mustBeExt { + const batchSize = 400 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC14] } return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - if mustBeExt { + const batchSize = 500 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC15] } return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC16] } return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] @@ -212,24 +225,24 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG1(c) + processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG1(lastC(c)) + processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. 
chSplit := make(chan g1JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -365,8 +378,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { case 4: @@ -382,37 +394,51 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu case 9: return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - if mustBeExt { + const batchSize = 80 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC10] } return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - if mustBeExt { + const batchSize = 150 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC11] } return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - if mustBeExt { + const batchSize = 200 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC12] } return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - if mustBeExt { + const batchSize = 350 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC13] } return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - if mustBeExt { + const batchSize = 400 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC14] } return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - if mustBeExt { + const batchSize = 500 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC15] } return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC16] } return processChunkG2BatchAffine[bucketG2AffineC16, 
bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] @@ -441,24 +467,24 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG2(c) + processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG2(lastC(c)) + processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -518,7 +544,18 @@ func lastC(c uint64) uint64 { } type chunkStat struct { - weight float32 // relative weight compared to other chunks. + // relative weight of work compared to other chunks. 100.0 -> nominal weight. + weight int + + // average absolute deviation. this is meant to give a sense of statistical + // dispersion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) + deviation int + + // count the number of buckets that are non-zero for this chunk + nonZeroBuckets int + + // average ops per non-zero bucket + averageOpsPerBucket int } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits @@ -555,9 +592,14 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } chOpsPerChunk := make(chan []int, nbTasks) + chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { opsPerChunk := make([]int, nbChunks) + opsPerBucketPerChunk := make([][]int, nbChunks) + for i := 0; i < len(opsPerBucketPerChunk); i++ { + opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets + } for i := start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -593,16 +635,18 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint16 - // if digit is zero, no impact on result if digit == 0 { continue } + + var bits uint16 if digit > 0 { bits = uint16(digit) << 1 + opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 + opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits opsPerChunk[chunk]++ @@ -610,24 +654,72 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } chOpsPerChunk <- opsPerChunk + chOpsPerBucketPerChunk <- opsPerBucketPerChunk }, nbTasks) // aggregate chunk stats + chunkStats := make([]chunkStat, nbChunks) close(chOpsPerChunk) + close(chOpsPerBucketPerChunk) opsPerChunk := make([]int, nbChunks) totalOps := 0 - for o := range chOpsPerChunk { - for i, nbOps := range o { + for chunks := range chOpsPerChunk { + for i, nbOps := range chunks { opsPerChunk[i] += nbOps totalOps += nbOps } } - chunkStats := make([]chunkStat, nbChunks) - target := float32(totalOps) / float32(nbChunks) + + opsPerBucketPerChunk := make([][]int, nbChunks) + for i := 0; i < 
len(opsPerBucketPerChunk); i++ { + opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets + } + for chunks := range chOpsPerBucketPerChunk { + for i, opsPerBucket := range chunks { + for j, o := range opsPerBucket { + // bucket j in chunk i has o operations + if opsPerBucketPerChunk[i][j] == 0 && o != 0 { + chunkStats[i].nonZeroBuckets++ + } + opsPerBucketPerChunk[i][j] += o + } + } + } + + abs := func(v int) int { + if v < 0 { + return -v + } + return v + } + + // we know the total ops for the chunk, the number of non zero buckets + // so we can compute the deviation; + // TODO @gbotrel do that in go routines + for chunkID := 0; chunkID < len(chunkStats); chunkID++ { + nz := chunkStats[chunkID].nonZeroBuckets + if nz == 0 { + continue // ignore chunk, full of zeroes. + } + mean := opsPerChunk[chunkID] / nz + aad := 0 + averageOpsPerBucket := 0 + for _, bucketOps := range opsPerBucketPerChunk[chunkID] { + aad += abs(bucketOps - mean) + averageOpsPerBucket += bucketOps + } + chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + chunkStats[chunkID].deviation = aad / nz + } + + target := totalOps / int(nbChunks) // what percentage are you of the target - for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + if target != 0 { + // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. + for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = opsPerChunk[i] * 100 / target + } } return digits, chunkStats diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 6425235853..3f5ea45edd 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -240,7 +240,9 @@ func BenchmarkMultiExpG1(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] @@ -555,7 +557,9 @@ func BenchmarkMultiExpG2(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
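The "redundant" distribution is adversarial for exactly the reason the comment above states: within one window of batchSize pending additions, two points that land in the same bucket cannot share the batched inversion, so the second one is parked in a queue and the processor is forced to flush early. A minimal sketch of that scheduling decision, with hypothetical names (the generated processors implement the same idea with a bitSet type and a fixed-capacity queue of conflicted points):

// scheduleAdd either admits a point into the current batch or defers it.
// touched marks buckets that already have a pending addition in this batch;
// a second hit on the same bucket would reuse a stale denominator before
// the bucket is updated, so the point must wait for a later batch.
func scheduleAdd(touched []bool, batch, queue *[]int, bucketID int) {
	if !touched[bucketID] {
		touched[bucketID] = true
		*batch = append(*batch, bucketID)
		return
	}
	// conflict: long runs of identical scalars hit this path constantly,
	// filling the queue and forcing small, inefficient flushes.
	*queue = append(*queue, bucketID)
}

With uniformly random scalars such conflicts are rare; copying one scalar across 100 consecutive slots, as the loop just below does, makes them the common case.
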
for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index cd49deb3de..711a527770 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -136,8 +136,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { case 4: @@ -153,37 +152,51 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu case 9: return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - if mustBeExt { + const batchSize = 80 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC10] } return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - if mustBeExt { + const batchSize = 150 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC11] } return processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - if mustBeExt { + const batchSize = 200 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC12] } return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - if mustBeExt { + const batchSize = 350 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC13] } return processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - if mustBeExt { + const batchSize = 400 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC14] } return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - if mustBeExt { + const batchSize = 500 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC15] } return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC16] } return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] @@ -212,24 +225,24 @@ func _innerMsmG1(p 
*G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG1(c) + processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG1(lastC(c)) + processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -365,8 +378,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { case 4: @@ -382,37 +394,51 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu case 9: return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - if mustBeExt { + const batchSize = 80 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC10] } return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - if mustBeExt { + const batchSize = 150 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC11] } return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - if mustBeExt { + const batchSize = 200 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC12] } return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - if mustBeExt { + const batchSize = 350 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC13] } return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - if mustBeExt { + const batchSize = 400 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC14] } return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - if mustBeExt { + const batchSize = 
500 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC15] } return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC16] } return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] @@ -441,24 +467,24 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG2(c) + processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG2(lastC(c)) + processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -518,7 +544,18 @@ func lastC(c uint64) uint64 { } type chunkStat struct { - weight float32 // relative weight compared to other chunks. + // relative weight of work compared to other chunks. 100.0 -> nominal weight. + weight int + + // average absolute deviation. 
this is meant to give a sense of statistical + // dispersion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) + deviation int + + // count the number of buckets that are non-zero for this chunk + nonZeroBuckets int + + // average ops per non-zero bucket + averageOpsPerBucket int } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits @@ -555,9 +592,14 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } chOpsPerChunk := make(chan []int, nbTasks) + chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { opsPerChunk := make([]int, nbChunks) + opsPerBucketPerChunk := make([][]int, nbChunks) + for i := 0; i < len(opsPerBucketPerChunk); i++ { + opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets + } for i := start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -593,16 +635,18 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint16 - // if digit is zero, no impact on result if digit == 0 { continue } + + var bits uint16 if digit > 0 { bits = uint16(digit) << 1 + opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 + opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits opsPerChunk[chunk]++ @@ -610,24 +654,72 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } chOpsPerChunk <- opsPerChunk + chOpsPerBucketPerChunk <- opsPerBucketPerChunk }, nbTasks) // aggregate chunk stats + chunkStats := make([]chunkStat, nbChunks) close(chOpsPerChunk) + close(chOpsPerBucketPerChunk) opsPerChunk := make([]int, nbChunks) totalOps := 0 - for o := range chOpsPerChunk { - for i, nbOps := range o { + for chunks := range chOpsPerChunk { + for i, nbOps := range chunks { opsPerChunk[i] += nbOps totalOps += nbOps } } - chunkStats := make([]chunkStat, nbChunks) - target := float32(totalOps) / float32(nbChunks) + + opsPerBucketPerChunk := make([][]int, nbChunks) + for i := 0; i < len(opsPerBucketPerChunk); i++ { + opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets + } + for chunks := range chOpsPerBucketPerChunk { + for i, opsPerBucket := range chunks { + for j, o := range opsPerBucket { + // bucket j in chunk i has o operations + if opsPerBucketPerChunk[i][j] == 0 && o != 0 { + chunkStats[i].nonZeroBuckets++ + } + opsPerBucketPerChunk[i][j] += o + } + } + } + + abs := func(v int) int { + if v < 0 { + return -v + } + return v + } + + // we know the total ops for the chunk, the number of non-zero buckets + // so we can compute the deviation; + // TODO @gbotrel do that in go routines + for chunkID := 0; chunkID < len(chunkStats); chunkID++ { + nz := chunkStats[chunkID].nonZeroBuckets + if nz == 0 { + continue // ignore chunk, full of zeroes. + } + mean := opsPerChunk[chunkID] / nz + aad := 0 + averageOpsPerBucket := 0 + for _, bucketOps := range opsPerBucketPerChunk[chunkID] { + aad += abs(bucketOps - mean) + averageOpsPerBucket += bucketOps + } + chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + chunkStats[chunkID].deviation = aad / nz + } + + target := totalOps / int(nbChunks) // what percentage are you of the target - for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + if target != 0 { + // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. 
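The weight computation, whose loop continues just below, is now integer-only: each chunk's op count becomes a percentage of the average per-chunk workload, and the target != 0 guard prevents a division by zero in the all-zero-scalars case. A self-contained sketch of the same normalization (hypothetical helper, not part of the generated code):

// chunkWeights expresses each chunk's workload as a percentage of the
// average per-chunk workload; 100 means nominal weight. A zero target
// (all scalars zero) leaves every weight at zero, so no chunk is split.
func chunkWeights(opsPerChunk []int) []int {
	weights := make([]int, len(opsPerChunk))
	if len(opsPerChunk) == 0 {
		return weights
	}
	totalOps := 0
	for _, nbOps := range opsPerChunk {
		totalOps += nbOps
	}
	target := totalOps / len(opsPerChunk)
	if target == 0 {
		return weights // no work to be done anywhere
	}
	for i, nbOps := range opsPerChunk {
		weights[i] = nbOps * 100 / target
	}
	return weights
}

_innerMsmG1 and _innerMsmG2 then split any chunk whose weight is at least 115, i.e. roughly 15% above nominal, across two goroutines.
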
+ for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = opsPerChunk[i] * 100 / target + } } return digits, chunkStats diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index 1b586aba7e..fd3aee65e6 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -240,7 +240,9 @@ func BenchmarkMultiExpG1(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] @@ -555,7 +557,9 @@ func BenchmarkMultiExpG2(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index 4d7a2e07f5..7402fa0c62 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -136,8 +136,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { case 4: @@ -153,37 +152,51 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu case 9: return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - if mustBeExt { + const batchSize = 80 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC10] } return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - if mustBeExt { + const batchSize = 150 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC11] } return processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - if mustBeExt { + const batchSize = 200 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC12] } return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - if mustBeExt { + const batchSize = 350 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC13] } return processChunkG1BatchAffine[bucketG1AffineC13, 
bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - if mustBeExt { + const batchSize = 400 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC14] } return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - if mustBeExt { + const batchSize = 500 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC15] } return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC16] } return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] @@ -212,24 +225,24 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG1(c) + processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG1(lastC(c)) + processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. 
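Besides lowering the split threshold from 150.0 to 115, the hunk continuing below fixes a classic Go closure pitfall: the merge goroutine used to capture the loop variable j, which (under pre-1.22 Go semantics) is shared across iterations, so the combined partial sum could be delivered to the wrong chunk channel; passing it explicitly as chunkID pins the value at spawn time. A minimal, runnable reproduction of the corrected fan-in pattern, with ints standing in for the extended-Jacobian partial sums:

package main

import "fmt"

func main() {
	const nbChunks = 3
	var chChunks [nbChunks]chan int
	for j := range chChunks {
		chChunks[j] = make(chan int, 1)
	}
	for j := nbChunks - 1; j >= 0; j-- {
		chSplit := make(chan int, 2)
		go func() { chSplit <- 1 }() // first half of the chunk
		go func() { chSplit <- 2 }() // second half of the chunk
		go func(chunkID int) {
			// chunkID is a copy made at spawn time; reading j here
			// instead could observe a later iteration's value.
			s1 := <-chSplit
			s2 := <-chSplit
			close(chSplit)
			chChunks[chunkID] <- s1 + s2 // merge the two partial sums
		}(j)
	}
	for j := range chChunks {
		fmt.Println("chunk", j, "sum", <-chChunks[j])
	}
}
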
chSplit := make(chan g1JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -365,8 +378,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { case 4: @@ -382,37 +394,51 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu case 9: return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - if mustBeExt { + const batchSize = 80 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC10] } return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - if mustBeExt { + const batchSize = 150 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC11] } return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - if mustBeExt { + const batchSize = 200 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC12] } return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - if mustBeExt { + const batchSize = 350 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC13] } return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - if mustBeExt { + const batchSize = 400 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC14] } return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - if mustBeExt { + const batchSize = 500 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC15] } return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC16] } return processChunkG2BatchAffine[bucketG2AffineC16, 
bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] @@ -441,24 +467,24 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG2(c) + processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG2(lastC(c)) + processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -518,7 +544,18 @@ func lastC(c uint64) uint64 { } type chunkStat struct { - weight float32 // relative weight compared to other chunks. + // relative weight of work compared to other chunks. 100.0 -> nominal weight. + weight int + + // average absolute deviation. this is meant to give a sense of statistical + // dispersion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) + deviation int + + // count the number of buckets that are non-zero for this chunk + nonZeroBuckets int + + // average ops per non-zero bucket + averageOpsPerBucket int } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits @@ -555,9 +592,14 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } chOpsPerChunk := make(chan []int, nbTasks) + chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { opsPerChunk := make([]int, nbChunks) + opsPerBucketPerChunk := make([][]int, nbChunks) + for i := 0; i < len(opsPerBucketPerChunk); i++ { + opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets + } for i := start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -593,16 +635,18 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint16 - // if digit is zero, no impact on result if digit == 0 { continue } + + var bits uint16 if digit > 0 { bits = uint16(digit) << 1 + opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 + opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits opsPerChunk[chunk]++ @@ -610,24 +654,72 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } chOpsPerChunk <- opsPerChunk + chOpsPerBucketPerChunk <- opsPerBucketPerChunk }, nbTasks) // aggregate chunk stats + chunkStats := make([]chunkStat, nbChunks) close(chOpsPerChunk) + close(chOpsPerBucketPerChunk) opsPerChunk := make([]int, nbChunks) totalOps := 0 - for o := range chOpsPerChunk { - for i, nbOps := range o { + for chunks := range chOpsPerChunk { + for i, nbOps := range chunks { opsPerChunk[i] += nbOps totalOps += nbOps } } - chunkStats := make([]chunkStat, nbChunks) - target := float32(totalOps) / float32(nbChunks) + + opsPerBucketPerChunk := make([][]int, nbChunks) + for i := 0; i < 
len(opsPerBucketPerChunk); i++ { + opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets + } + for chunks := range chOpsPerBucketPerChunk { + for i, opsPerBucket := range chunks { + for j, o := range opsPerBucket { + // bucket j in chunk i has o operations + if opsPerBucketPerChunk[i][j] == 0 && o != 0 { + chunkStats[i].nonZeroBuckets++ + } + opsPerBucketPerChunk[i][j] += o + } + } + } + + abs := func(v int) int { + if v < 0 { + return -v + } + return v + } + + // we know the total ops for the chunk and the number of non-zero buckets, + // so we can compute the deviation; + // TODO @gbotrel do that in goroutines + for chunkID := 0; chunkID < len(chunkStats); chunkID++ { + nz := chunkStats[chunkID].nonZeroBuckets + if nz == 0 { + continue // ignore chunk, full of zeroes. + } + mean := opsPerChunk[chunkID] / nz + aad := 0 + averageOpsPerBucket := 0 + for _, bucketOps := range opsPerBucketPerChunk[chunkID] { + aad += abs(bucketOps - mean) + averageOpsPerBucket += bucketOps + } + chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + chunkStats[chunkID].deviation = aad / nz + } + + target := totalOps / int(nbChunks) // what percentage are you of the target - for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + if target != 0 { + // if target == 0, it means all the scalars are zero and there is no work to be done. + for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = opsPerChunk[i] * 100 / target + } } return digits, chunkStats diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index 2b3ac662fc..9ce352672c 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -240,7 +240,9 @@ func BenchmarkMultiExpG1(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] @@ -555,7 +557,9 @@ func BenchmarkMultiExpG2(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points.
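Note on the "bad case" above: a rough standalone sketch of the effect (countConflicts is a hypothetical helper written for this note, not code from the patch). When blocks of identical scalars map to the same bucket, almost every addition finds its bucket already queued in the current batch window and forces an early flush:

	// hypothetical helper, for illustration only: count additions whose
	// bucket is already queued in the current batch window.
	func countConflicts(bucketIDs []uint16, batchSize int) int {
		conflicts := 0
		inBatch := make(map[uint16]struct{}, batchSize)
		for _, b := range bucketIDs {
			if _, queued := inBatch[b]; queued {
				conflicts++
				// model an early flush: the pending batch is processed so
				// this point can be scheduled against a fresh window
				inBatch = map[uint16]struct{}{b: {}}
				continue
			}
			inBatch[b] = struct{}{}
			if len(inBatch) == batchSize {
				inBatch = make(map[uint16]struct{}, batchSize) // full batch: normal flush
			}
		}
		return conflicts
	}

With uniform random scalars, conflicts stay rare and batches fill up; with the redundant scalars built below, the conflict path dominates and batches stay small.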
for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 3f49fd1afd..f1b13bdfc1 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -136,8 +136,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { case 4: @@ -153,37 +152,51 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu case 9: return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - if mustBeExt { + const batchSize = 80 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC10] } return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - if mustBeExt { + const batchSize = 150 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC11] } return processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - if mustBeExt { + const batchSize = 200 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC12] } return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - if mustBeExt { + const batchSize = 350 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC13] } return processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - if mustBeExt { + const batchSize = 400 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC14] } return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - if mustBeExt { + const batchSize = 500 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC15] } return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC16] } return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] @@ -212,24 +225,24 @@ func _innerMsmG1(p 
*G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG1(c) + processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG1(lastC(c)) + processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -365,8 +378,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { case 4: @@ -382,37 +394,51 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu case 9: return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - if mustBeExt { + const batchSize = 80 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC10] } return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - if mustBeExt { + const batchSize = 150 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC11] } return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - if mustBeExt { + const batchSize = 200 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC12] } return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - if mustBeExt { + const batchSize = 350 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC13] } return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - if mustBeExt { + const batchSize = 400 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC14] } return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - if mustBeExt { + const batchSize = 
500 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC15] } return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC16] } return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] @@ -441,24 +467,24 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG2(c) + processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG2(lastC(c)) + processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -518,7 +544,18 @@ func lastC(c uint64) uint64 { } type chunkStat struct { - weight float32 // relative weight compared to other chunks. + // relative weight of work compared to other chunks. 100.0 -> nominal weight. + weight int + + // average absolute deviation. 
this is meant to give a sense of statistical + dispersion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) + deviation int + + // count the number of non-zero buckets for this chunk + nonZeroBuckets int + + // average ops per non-zero bucket + averageOpsPerBucket int } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits @@ -555,9 +592,14 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } chOpsPerChunk := make(chan []int, nbTasks) + chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { opsPerChunk := make([]int, nbChunks) + opsPerBucketPerChunk := make([][]int, nbChunks) + for i := 0; i < len(opsPerBucketPerChunk); i++ { + opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets + } for i := start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -593,16 +635,18 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint16 - // if digit is zero, no impact on result if digit == 0 { continue } + + var bits uint16 if digit > 0 { bits = uint16(digit) << 1 + opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 + opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits opsPerChunk[chunk]++ @@ -610,24 +654,72 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } chOpsPerChunk <- opsPerChunk + chOpsPerBucketPerChunk <- opsPerBucketPerChunk }, nbTasks) // aggregate chunk stats + chunkStats := make([]chunkStat, nbChunks) close(chOpsPerChunk) + close(chOpsPerBucketPerChunk) opsPerChunk := make([]int, nbChunks) totalOps := 0 - for o := range chOpsPerChunk { - for i, nbOps := range o { + for chunks := range chOpsPerChunk { + for i, nbOps := range chunks { opsPerChunk[i] += nbOps totalOps += nbOps } } - chunkStats := make([]chunkStat, nbChunks) - target := float32(totalOps) / float32(nbChunks) + + opsPerBucketPerChunk := make([][]int, nbChunks) + for i := 0; i < len(opsPerBucketPerChunk); i++ { + opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets + } + for chunks := range chOpsPerBucketPerChunk { + for i, opsPerBucket := range chunks { + for j, o := range opsPerBucket { + // bucket j in chunk i has o operations + if opsPerBucketPerChunk[i][j] == 0 && o != 0 { + chunkStats[i].nonZeroBuckets++ + } + opsPerBucketPerChunk[i][j] += o + } + } + } + + abs := func(v int) int { + if v < 0 { + return -v + } + return v + } + + // we know the total ops for the chunk and the number of non-zero buckets, + // so we can compute the deviation; + // TODO @gbotrel do that in goroutines + for chunkID := 0; chunkID < len(chunkStats); chunkID++ { + nz := chunkStats[chunkID].nonZeroBuckets + if nz == 0 { + continue // ignore chunk, full of zeroes. + } + mean := opsPerChunk[chunkID] / nz + aad := 0 + averageOpsPerBucket := 0 + for _, bucketOps := range opsPerBucketPerChunk[chunkID] { + aad += abs(bucketOps - mean) + averageOpsPerBucket += bucketOps + } + chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + chunkStats[chunkID].deviation = aad / nz + } + + target := totalOps / int(nbChunks) // what percentage are you of the target - for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + if target != 0 { + // if target == 0, it means all the scalars are zero and there is no work to be done.
+ for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = opsPerChunk[i] * 100 / target + } } return digits, chunkStats diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index e0b3958b87..c40b9ccf21 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -240,7 +240,9 @@ func BenchmarkMultiExpG1(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] @@ -555,7 +557,9 @@ func BenchmarkMultiExpG2(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index 29548cc89c..e1c79c6ba5 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -136,8 +136,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { case 4: @@ -153,37 +152,51 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu case 9: return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - if mustBeExt { + const batchSize = 80 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC10] } return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - if mustBeExt { + const batchSize = 150 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC11] } return processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - if mustBeExt { + const batchSize = 200 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC12] } return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - if mustBeExt { + const batchSize = 350 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC13] } return processChunkG1BatchAffine[bucketG1AffineC13, 
bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - if mustBeExt { + const batchSize = 400 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC14] } return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - if mustBeExt { + const batchSize = 500 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC15] } return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC16] } return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] @@ -212,24 +225,24 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG1(c) + processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG1(lastC(c)) + processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. 
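For intuition on the weight >= 115 test just above: weight is an integer percentage of the average chunk workload, so 115 reads as "at least 15% more ops than the average chunk". A sketch with invented numbers (not taken from any benchmark), assuming the definitions from partitionScalars:

	// illustrative arithmetic only
	func exampleSplitDecision() bool {
		totalOps, nbChunks := 4000, 8
		target := totalOps / nbChunks // 500 ops per chunk on average
		weight := 650 * 100 / target  // a 650-op chunk gets weight 130
		return weight >= 115          // true: this chunk is split across two goroutines
	}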
chSplit := make(chan g1JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -365,8 +378,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { case 4: @@ -382,37 +394,51 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu case 9: return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - if mustBeExt { + const batchSize = 80 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC10] } return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - if mustBeExt { + const batchSize = 150 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC11] } return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - if mustBeExt { + const batchSize = 200 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC12] } return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - if mustBeExt { + const batchSize = 350 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC13] } return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - if mustBeExt { + const batchSize = 400 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC14] } return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - if mustBeExt { + const batchSize = 500 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC15] } return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC16] } return processChunkG2BatchAffine[bucketG2AffineC16, 
bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] @@ -441,24 +467,24 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG2(c) + processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG2(lastC(c)) + processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -518,7 +544,18 @@ func lastC(c uint64) uint64 { } type chunkStat struct { - weight float32 // relative weight compared to other chunks. + // relative weight of work compared to other chunks. 100.0 -> nominal weight. + weight int + + // average absolute deviation. this is meant to give a sense of statistical + dispersion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) + deviation int + + // count the number of non-zero buckets for this chunk + nonZeroBuckets int + + // average ops per non-zero bucket + averageOpsPerBucket int } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits @@ -555,9 +592,14 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } chOpsPerChunk := make(chan []int, nbTasks) + chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { opsPerChunk := make([]int, nbChunks) + opsPerBucketPerChunk := make([][]int, nbChunks) + for i := 0; i < len(opsPerBucketPerChunk); i++ { + opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets + } for i := start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -593,16 +635,18 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint16 - // if digit is zero, no impact on result if digit == 0 { continue } + + var bits uint16 if digit > 0 { bits = uint16(digit) << 1 + opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 + opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits opsPerChunk[chunk]++ @@ -610,24 +654,72 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } chOpsPerChunk <- opsPerChunk + chOpsPerBucketPerChunk <- opsPerBucketPerChunk }, nbTasks) // aggregate chunk stats + chunkStats := make([]chunkStat, nbChunks) close(chOpsPerChunk) + close(chOpsPerBucketPerChunk) opsPerChunk := make([]int, nbChunks) totalOps := 0 - for o := range chOpsPerChunk { - for i, nbOps := range o { + for chunks := range chOpsPerChunk { + for i, nbOps := range chunks { opsPerChunk[i] += nbOps totalOps += nbOps } } - chunkStats := make([]chunkStat, nbChunks) - target := float32(totalOps) / float32(nbChunks) + + opsPerBucketPerChunk := make([][]int, nbChunks) + for i := 0; i <
len(opsPerBucketPerChunk); i++ { + opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets + } + for chunks := range chOpsPerBucketPerChunk { + for i, opsPerBucket := range chunks { + for j, o := range opsPerBucket { + // bucket j in chunk i has o operations + if opsPerBucketPerChunk[i][j] == 0 && o != 0 { + chunkStats[i].nonZeroBuckets++ + } + opsPerBucketPerChunk[i][j] += o + } + } + } + + abs := func(v int) int { + if v < 0 { + return -v + } + return v + } + + // we know the total ops for the chunk and the number of non-zero buckets, + // so we can compute the deviation; + // TODO @gbotrel do that in goroutines + for chunkID := 0; chunkID < len(chunkStats); chunkID++ { + nz := chunkStats[chunkID].nonZeroBuckets + if nz == 0 { + continue // ignore chunk, full of zeroes. + } + mean := opsPerChunk[chunkID] / nz + aad := 0 + averageOpsPerBucket := 0 + for _, bucketOps := range opsPerBucketPerChunk[chunkID] { + aad += abs(bucketOps - mean) + averageOpsPerBucket += bucketOps + } + chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + chunkStats[chunkID].deviation = aad / nz + } + + target := totalOps / int(nbChunks) // what percentage are you of the target - for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + if target != 0 { + // if target == 0, it means all the scalars are zero and there is no work to be done. + for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = opsPerChunk[i] * 100 / target + } } return digits, chunkStats diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index 7bc0eacb61..87caea0886 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -240,7 +240,9 @@ func BenchmarkMultiExpG1(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] @@ -555,7 +557,9 @@ func BenchmarkMultiExpG2(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points.
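The edge-case test repeated in each switch case of getChunkProcessorG1/G2 above can be read as a single predicate. A sketch of the rule, factored out for readability (an assumed form; the generated code inlines it per window size, with batchSize growing from 80 at c=10 to 640 at c=16):

	// sketch only; the patch inlines this check in every case of the switch
	func useBatchAffine(stat chunkStat, batchSize int) bool {
		// batch-affine additions pay off only when enough distinct buckets
		// are hit to fill batches (nonZeroBuckets >= batchSize) and the ops
		// are spread evenly enough (low deviation) that the queue of
		// conflicted points stays short; otherwise fall back to extended
		// Jacobian processing.
		return stat.nonZeroBuckets >= batchSize && stat.deviation < 4
	}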
for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index 87cee65e19..0fe60e8201 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -136,8 +136,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { case 4: @@ -153,37 +152,51 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu case 9: return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - if mustBeExt { + const batchSize = 80 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC10] } return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - if mustBeExt { + const batchSize = 150 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC11] } return processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - if mustBeExt { + const batchSize = 200 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC12] } return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - if mustBeExt { + const batchSize = 350 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC13] } return processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - if mustBeExt { + const batchSize = 400 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC14] } return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - if mustBeExt { + const batchSize = 500 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC15] } return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC16] } return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] @@ -212,24 +225,24 @@ func _innerMsmG1(p *G1Jac, c uint64, 
points []G1Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG1(c) + processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG1(lastC(c)) + processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -365,8 +378,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { case 4: @@ -382,37 +394,51 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu case 9: return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - if mustBeExt { + const batchSize = 80 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC10] } return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - if mustBeExt { + const batchSize = 150 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC11] } return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - if mustBeExt { + const batchSize = 200 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC12] } return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - if mustBeExt { + const batchSize = 350 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC13] } return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - if mustBeExt { + const batchSize = 400 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC14] } return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - if mustBeExt { + const batchSize = 500 + 
edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC15] } return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC16] } return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] @@ -441,24 +467,24 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG2(c) + processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG2(lastC(c)) + processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -518,7 +544,18 @@ func lastC(c uint64) uint64 { } type chunkStat struct { - weight float32 // relative weight compared to other chunks. + // relative weight of work compared to other chunks. 100.0 -> nominal weight. + weight int + + // average absolute deviation. 
this is meant to give a sense of statistical + dispersion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) + deviation int + + // count the number of non-zero buckets for this chunk + nonZeroBuckets int + + // average ops per non-zero bucket + averageOpsPerBucket int } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits @@ -555,9 +592,14 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } chOpsPerChunk := make(chan []int, nbTasks) + chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { opsPerChunk := make([]int, nbChunks) + opsPerBucketPerChunk := make([][]int, nbChunks) + for i := 0; i < len(opsPerBucketPerChunk); i++ { + opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets + } for i := start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -593,16 +635,18 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint16 - // if digit is zero, no impact on result if digit == 0 { continue } + + var bits uint16 if digit > 0 { bits = uint16(digit) << 1 + opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 + opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits opsPerChunk[chunk]++ @@ -610,24 +654,72 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } chOpsPerChunk <- opsPerChunk + chOpsPerBucketPerChunk <- opsPerBucketPerChunk }, nbTasks) // aggregate chunk stats + chunkStats := make([]chunkStat, nbChunks) close(chOpsPerChunk) + close(chOpsPerBucketPerChunk) opsPerChunk := make([]int, nbChunks) totalOps := 0 - for o := range chOpsPerChunk { - for i, nbOps := range o { + for chunks := range chOpsPerChunk { + for i, nbOps := range chunks { opsPerChunk[i] += nbOps totalOps += nbOps } } - chunkStats := make([]chunkStat, nbChunks) - target := float32(totalOps) / float32(nbChunks) + + opsPerBucketPerChunk := make([][]int, nbChunks) + for i := 0; i < len(opsPerBucketPerChunk); i++ { + opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets + } + for chunks := range chOpsPerBucketPerChunk { + for i, opsPerBucket := range chunks { + for j, o := range opsPerBucket { + // bucket j in chunk i has o operations + if opsPerBucketPerChunk[i][j] == 0 && o != 0 { + chunkStats[i].nonZeroBuckets++ + } + opsPerBucketPerChunk[i][j] += o + } + } + } + + abs := func(v int) int { + if v < 0 { + return -v + } + return v + } + + // we know the total ops for the chunk and the number of non-zero buckets, + // so we can compute the deviation; + // TODO @gbotrel do that in goroutines + for chunkID := 0; chunkID < len(chunkStats); chunkID++ { + nz := chunkStats[chunkID].nonZeroBuckets + if nz == 0 { + continue // ignore chunk, full of zeroes. + } + mean := opsPerChunk[chunkID] / nz + aad := 0 + averageOpsPerBucket := 0 + for _, bucketOps := range opsPerBucketPerChunk[chunkID] { + aad += abs(bucketOps - mean) + averageOpsPerBucket += bucketOps + } + chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + chunkStats[chunkID].deviation = aad / nz + } + + target := totalOps / int(nbChunks) // what percentage are you of the target - for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + if target != 0 { + // if target == 0, it means all the scalars are zero and there is no work to be done.
+ for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = opsPerChunk[i] * 100 / target + } } return digits, chunkStats diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index d5840336a4..378662496e 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -240,7 +240,9 @@ func BenchmarkMultiExpG1(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] @@ -555,7 +557,9 @@ func BenchmarkMultiExpG2(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index d77d85346f..c8d0d6518e 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -136,8 +136,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { case 4: @@ -147,7 +146,9 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu case 8: return processChunkG1Jacobian[bucketg1JacExtendedC8] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC16] } return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] @@ -176,24 +177,24 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG1(c) + processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG1(lastC(c)) + processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. 
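The go func(chunkID int) { ... }(j) rewrite above (and in every copy of this loop) fixes a classic Go pitfall: a closure launched inside a loop shares the loop variable, so by the time the goroutine runs, j may already have advanced and s1 would be sent on the wrong channel. A minimal, self-contained illustration (pre-Go 1.22 loop-variable semantics):

	package main

	import (
		"fmt"
		"sync"
	)

	func main() {
		var wg sync.WaitGroup
		for j := 0; j < 3; j++ {
			wg.Add(2)
			go func() { // shares j: may print 3 three times
				defer wg.Done()
				fmt.Println("captured:", j)
			}()
			go func(id int) { // pins the value of j at spawn time
				defer wg.Done()
				fmt.Println("pinned:", id)
			}(j)
		}
		wg.Wait()
	}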
chSplit := make(chan g1JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -329,8 +330,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { case 4: @@ -340,7 +340,9 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu case 8: return processChunkG2Jacobian[bucketg2JacExtendedC8] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC16] } return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] @@ -369,24 +371,24 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG2(c) + processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG2(lastC(c)) + processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -446,7 +448,18 @@ func lastC(c uint64) uint64 { } type chunkStat struct { - weight float32 // relative weight compared to other chunks. + // relative weight of work compared to other chunks. 100.0 -> nominal weight. + weight int + + // average absolute deviation. 
this is meant to give a sense of statistical + dispersion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) + deviation int + + // count the number of non-zero buckets for this chunk + nonZeroBuckets int + + // average ops per non-zero bucket + averageOpsPerBucket int } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits @@ -483,9 +496,14 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } chOpsPerChunk := make(chan []int, nbTasks) + chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { opsPerChunk := make([]int, nbChunks) + opsPerBucketPerChunk := make([][]int, nbChunks) + for i := 0; i < len(opsPerBucketPerChunk); i++ { + opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets + } for i := start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -521,16 +539,18 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint16 - // if digit is zero, no impact on result if digit == 0 { continue } + + var bits uint16 if digit > 0 { bits = uint16(digit) << 1 + opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 + opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits opsPerChunk[chunk]++ @@ -538,24 +558,72 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } chOpsPerChunk <- opsPerChunk + chOpsPerBucketPerChunk <- opsPerBucketPerChunk }, nbTasks) // aggregate chunk stats + chunkStats := make([]chunkStat, nbChunks) close(chOpsPerChunk) + close(chOpsPerBucketPerChunk) opsPerChunk := make([]int, nbChunks) totalOps := 0 - for o := range chOpsPerChunk { - for i, nbOps := range o { + for chunks := range chOpsPerChunk { + for i, nbOps := range chunks { opsPerChunk[i] += nbOps totalOps += nbOps } } - chunkStats := make([]chunkStat, nbChunks) - target := float32(totalOps) / float32(nbChunks) + + opsPerBucketPerChunk := make([][]int, nbChunks) + for i := 0; i < len(opsPerBucketPerChunk); i++ { + opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets + } + for chunks := range chOpsPerBucketPerChunk { + for i, opsPerBucket := range chunks { + for j, o := range opsPerBucket { + // bucket j in chunk i has o operations + if opsPerBucketPerChunk[i][j] == 0 && o != 0 { + chunkStats[i].nonZeroBuckets++ + } + opsPerBucketPerChunk[i][j] += o + } + } + } + + abs := func(v int) int { + if v < 0 { + return -v + } + return v + } + + // we know the total ops for the chunk and the number of non-zero buckets, + // so we can compute the deviation; + // TODO @gbotrel do that in goroutines + for chunkID := 0; chunkID < len(chunkStats); chunkID++ { + nz := chunkStats[chunkID].nonZeroBuckets + if nz == 0 { + continue // ignore chunk, full of zeroes. + } + mean := opsPerChunk[chunkID] / nz + aad := 0 + averageOpsPerBucket := 0 + for _, bucketOps := range opsPerBucketPerChunk[chunkID] { + aad += abs(bucketOps - mean) + averageOpsPerBucket += bucketOps + } + chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + chunkStats[chunkID].deviation = aad / nz + } + + target := totalOps / int(nbChunks) // what percentage are you of the target - for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + if target != 0 { + // if target == 0, it means all the scalars are zero and there is no work to be done.
+ for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = opsPerChunk[i] * 100 / target + } } return digits, chunkStats diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 244ae19386..622c98195b 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -240,7 +240,9 @@ func BenchmarkMultiExpG1(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] @@ -555,7 +557,9 @@ func BenchmarkMultiExpG2(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index ae313e1d52..72a0b35b7b 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -136,8 +136,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { case 4: @@ -147,7 +146,9 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu case 8: return processChunkG1Jacobian[bucketg1JacExtendedC8] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC16] } return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] @@ -176,24 +177,24 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG1(c) + processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG1(lastC(c)) + processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. 
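For reference, the signed-digit packing that partitionScalars uses above round-trips as follows (sketch; the function names are ours, the patch only ever stores the uint16):

	// bit 0 of the encoded digit tells the chunk processor whether to add or
	// subtract the point; the remaining bits select the bucket. A zero digit
	// is skipped by the caller and never encoded.
	func encodeDigit(d int) uint16 {
		if d == 0 {
			return 0
		}
		if d > 0 {
			return uint16(d) << 1 // LSB 0: add the point
		}
		return uint16(-d-1)<<1 + 1 // LSB 1: subtract the point
	}

	func decodeDigit(bits uint16) int {
		if bits&1 == 0 {
			return int(bits >> 1)
		}
		return -int(bits>>1) - 1
	}

	// e.g. encodeDigit(5) == 10 and decodeDigit(10) == 5;
	//      encodeDigit(-3) == 5 and decodeDigit(5) == -3

Folding negative digits onto the same magnitudes is also why only 1 << (c - 1) buckets are allocated per chunk.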
chSplit := make(chan g1JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -329,8 +330,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { case 4: @@ -340,7 +340,9 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu case 8: return processChunkG2Jacobian[bucketg2JacExtendedC8] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC16] } return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] @@ -369,24 +371,24 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG2(c) + processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG2(lastC(c)) + processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -446,7 +448,18 @@ func lastC(c uint64) uint64 { } type chunkStat struct { - weight float32 // relative weight compared to other chunks. + // relative weight of work compared to other chunks. 100.0 -> nominal weight. + weight int + + // average absolute deviation. 
this is meant to give a sense of the statistical
+	// dispersion of the scalars[chunk] in the buckets that are hit (nonZeroBuckets)
+	deviation int
+
+	// number of non-zero buckets for this chunk
+	nonZeroBuckets int
+
+	// average ops per non-zero bucket
+	averageOpsPerBucket int
 }
 
 // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits
@@ -483,9 +496,14 @@
 	}
 
 	chOpsPerChunk := make(chan []int, nbTasks)
+	chOpsPerBucketPerChunk := make(chan [][]int, nbTasks)
 
 	parallel.Execute(len(scalars), func(start, end int) {
 		opsPerChunk := make([]int, nbChunks)
+		opsPerBucketPerChunk := make([][]int, nbChunks)
+		for i := 0; i < len(opsPerBucketPerChunk); i++ {
+			opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets
+		}
 		for i := start; i < end; i++ {
 			scalar := scalars[i]
 			if scalarsMont {
@@ -521,16 +539,18 @@
 				carry = 1
 			}
 
-			var bits uint16
-
 			// if digit is zero, no impact on result
 			if digit == 0 {
 				continue
 			}
+
+			var bits uint16
 			if digit > 0 {
 				bits = uint16(digit) << 1
+				opsPerBucketPerChunk[chunk][uint16(digit)]++
 			} else {
 				bits = (uint16(-digit-1) << 1) + 1
+				opsPerBucketPerChunk[chunk][uint16(-digit-1)]++
 			}
 			digits[int(chunk)*len(scalars)+i] = bits
 			opsPerChunk[chunk]++
@@ -538,24 +558,72 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks
 		}
 
 		chOpsPerChunk <- opsPerChunk
+		chOpsPerBucketPerChunk <- opsPerBucketPerChunk
 	}, nbTasks)
 
 	// aggregate chunk stats
+	chunkStats := make([]chunkStat, nbChunks)
 	close(chOpsPerChunk)
+	close(chOpsPerBucketPerChunk)
 	opsPerChunk := make([]int, nbChunks)
 	totalOps := 0
-	for o := range chOpsPerChunk {
-		for i, nbOps := range o {
+	for chunks := range chOpsPerChunk {
+		for i, nbOps := range chunks {
 			opsPerChunk[i] += nbOps
 			totalOps += nbOps
 		}
 	}
-	chunkStats := make([]chunkStat, nbChunks)
-	target := float32(totalOps) / float32(nbChunks)
+
+	opsPerBucketPerChunk := make([][]int, nbChunks)
+	for i := 0; i < len(opsPerBucketPerChunk); i++ {
+		opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets
+	}
+	for chunks := range chOpsPerBucketPerChunk {
+		for i, opsPerBucket := range chunks {
+			for j, o := range opsPerBucket {
+				// bucket j in chunk i has o operations
+				if opsPerBucketPerChunk[i][j] == 0 && o != 0 {
+					chunkStats[i].nonZeroBuckets++
+				}
+				opsPerBucketPerChunk[i][j] += o
+			}
+		}
+	}
+
+	abs := func(v int) int {
+		if v < 0 {
+			return -v
+		}
+		return v
+	}
+
+	// we know the total ops for the chunk and the number of non-zero buckets,
+	// so we can compute the deviation;
+	// TODO @gbotrel do that in goroutines
+	for chunkID := 0; chunkID < len(chunkStats); chunkID++ {
+		nz := chunkStats[chunkID].nonZeroBuckets
+		if nz == 0 {
+			continue // ignore chunk, full of zeroes.
+		}
+		mean := opsPerChunk[chunkID] / nz
+		aad := 0
+		averageOpsPerBucket := 0
+		for _, bucketOps := range opsPerBucketPerChunk[chunkID] {
+			aad += abs(bucketOps - mean)
+			averageOpsPerBucket += bucketOps
+		}
+		chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz
+		chunkStats[chunkID].deviation = aad / nz
+	}
+
+	target := totalOps / int(nbChunks)
 	// what percentage are you of the target
-	for i := 0; i < len(chunkStats); i++ {
-		chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target
+	if target != 0 {
+		// if target == 0, all the scalars are 0 everywhere; there is no work to be done.
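+		// e.g. with totalOps = 300 over 3 chunks, target = 100 and chunks doing
+		// 50, 100 and 150 ops get weights 50, 100 and 150; _innerMsmG1/G2 then
+		// split any chunk whose weight reaches 115 in two.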
+ for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = opsPerChunk[i] * 100 / target + } } return digits, chunkStats diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index 2a1f0cda97..f7488f5dd7 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -240,7 +240,9 @@ func BenchmarkMultiExpG1(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] @@ -555,7 +557,9 @@ func BenchmarkMultiExpG2(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index 1c7ae6f672..32920cef38 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -136,8 +136,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { case 4: @@ -147,7 +146,9 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu case 8: return processChunkG1Jacobian[bucketg1JacExtendedC8] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC16] } return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] @@ -176,24 +177,24 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG1(c) + processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG1(lastC(c)) + processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. 
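			// (weight is the chunk's share of work as a percentage of the average
			// chunk load: e.g. with three chunks doing 50, 100 and 150 ops, the
			// weights are 50, 100 and 150, and only the last one is split here.)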
chSplit := make(chan g1JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -329,8 +330,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { case 4: @@ -340,7 +340,9 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu case 8: return processChunkG2Jacobian[bucketg2JacExtendedC8] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC16] } return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] @@ -369,24 +371,24 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG2(c) + processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG2(lastC(c)) + processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -446,7 +448,18 @@ func lastC(c uint64) uint64 { } type chunkStat struct { - weight float32 // relative weight compared to other chunks. + // relative weight of work compared to other chunks. 100.0 -> nominal weight. + weight int + + // average absolute deviation. 
this is meant to give a sense of the statistical
+	// dispersion of the scalars[chunk] in the buckets that are hit (nonZeroBuckets)
+	deviation int
+
+	// number of non-zero buckets for this chunk
+	nonZeroBuckets int
+
+	// average ops per non-zero bucket
+	averageOpsPerBucket int
 }
 
 // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits
@@ -483,9 +496,14 @@
 	}
 
 	chOpsPerChunk := make(chan []int, nbTasks)
+	chOpsPerBucketPerChunk := make(chan [][]int, nbTasks)
 
 	parallel.Execute(len(scalars), func(start, end int) {
 		opsPerChunk := make([]int, nbChunks)
+		opsPerBucketPerChunk := make([][]int, nbChunks)
+		for i := 0; i < len(opsPerBucketPerChunk); i++ {
+			opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets
+		}
 		for i := start; i < end; i++ {
 			scalar := scalars[i]
 			if scalarsMont {
@@ -521,16 +539,18 @@
 				carry = 1
 			}
 
-			var bits uint16
-
 			// if digit is zero, no impact on result
 			if digit == 0 {
 				continue
 			}
+
+			var bits uint16
 			if digit > 0 {
 				bits = uint16(digit) << 1
+				opsPerBucketPerChunk[chunk][uint16(digit)]++
 			} else {
 				bits = (uint16(-digit-1) << 1) + 1
+				opsPerBucketPerChunk[chunk][uint16(-digit-1)]++
 			}
 			digits[int(chunk)*len(scalars)+i] = bits
 			opsPerChunk[chunk]++
@@ -538,24 +558,72 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks
 		}
 
 		chOpsPerChunk <- opsPerChunk
+		chOpsPerBucketPerChunk <- opsPerBucketPerChunk
 	}, nbTasks)
 
 	// aggregate chunk stats
+	chunkStats := make([]chunkStat, nbChunks)
 	close(chOpsPerChunk)
+	close(chOpsPerBucketPerChunk)
 	opsPerChunk := make([]int, nbChunks)
 	totalOps := 0
-	for o := range chOpsPerChunk {
-		for i, nbOps := range o {
+	for chunks := range chOpsPerChunk {
+		for i, nbOps := range chunks {
 			opsPerChunk[i] += nbOps
 			totalOps += nbOps
 		}
 	}
-	chunkStats := make([]chunkStat, nbChunks)
-	target := float32(totalOps) / float32(nbChunks)
+
+	opsPerBucketPerChunk := make([][]int, nbChunks)
+	for i := 0; i < len(opsPerBucketPerChunk); i++ {
+		opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets
+	}
+	for chunks := range chOpsPerBucketPerChunk {
+		for i, opsPerBucket := range chunks {
+			for j, o := range opsPerBucket {
+				// bucket j in chunk i has o operations
+				if opsPerBucketPerChunk[i][j] == 0 && o != 0 {
+					chunkStats[i].nonZeroBuckets++
+				}
+				opsPerBucketPerChunk[i][j] += o
+			}
+		}
+	}
+
+	abs := func(v int) int {
+		if v < 0 {
+			return -v
+		}
+		return v
+	}
+
+	// we know the total ops for the chunk and the number of non-zero buckets,
+	// so we can compute the deviation;
+	// TODO @gbotrel do that in goroutines
+	for chunkID := 0; chunkID < len(chunkStats); chunkID++ {
+		nz := chunkStats[chunkID].nonZeroBuckets
+		if nz == 0 {
+			continue // ignore chunk, full of zeroes.
+		}
+		mean := opsPerChunk[chunkID] / nz
+		aad := 0
+		averageOpsPerBucket := 0
+		for _, bucketOps := range opsPerBucketPerChunk[chunkID] {
+			aad += abs(bucketOps - mean)
+			averageOpsPerBucket += bucketOps
+		}
+		chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz
+		chunkStats[chunkID].deviation = aad / nz
+	}
+
+	target := totalOps / int(nbChunks)
 	// what percentage are you of the target
-	for i := 0; i < len(chunkStats); i++ {
-		chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target
+	if target != 0 {
+		// if target == 0, all the scalars are 0 everywhere; there is no work to be done.
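+		// e.g. with totalOps = 300 over 3 chunks, target = 100 and chunks doing
+		// 50, 100 and 150 ops get weights 50, 100 and 150; _innerMsmG1/G2 then
+		// split any chunk whose weight reaches 115 in two.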
+ for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = opsPerChunk[i] * 100 / target + } } return digits, chunkStats diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index 589464949f..c7fdcf64cf 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -240,7 +240,9 @@ func BenchmarkMultiExpG1(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] @@ -555,7 +557,9 @@ func BenchmarkMultiExpG2(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index 4aa916fcfc..a274e26e66 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -55,7 +55,18 @@ func lastC(c uint64) uint64 { } type chunkStat struct { - weight float32 // relative weight compared to other chunks. + // relative weight of work compared to other chunks. 100.0 -> nominal weight. + weight int + + // average absolute deviation. 
this is meant to give a sense of the statistical
+	// dispersion of the scalars[chunk] in the buckets that are hit (nonZeroBuckets)
+	deviation int
+
+	// number of non-zero buckets for this chunk
+	nonZeroBuckets int
+
+	// average ops per non-zero bucket
+	averageOpsPerBucket int
 }
 
 
@@ -94,9 +105,14 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks
 	}
 
 	chOpsPerChunk := make(chan []int, nbTasks)
+	chOpsPerBucketPerChunk := make(chan [][]int, nbTasks)
 
 	parallel.Execute(len(scalars), func(start, end int) {
 		opsPerChunk := make([]int, nbChunks)
+		opsPerBucketPerChunk := make([][]int, nbChunks)
+		for i:=0; i < len(opsPerBucketPerChunk);i++ {
+			opsPerBucketPerChunk[i] = make([]int, (1 << (c-1))) // nbBuckets
+		}
 		for i:=start; i < end; i++ {
 			scalar := scalars[i]
 			if scalarsMont {
@@ -133,16 +149,18 @@
 				carry = 1
 			}
 
-			var bits uint16
-
 			// if digit is zero, no impact on result
 			if digit == 0 {
 				continue
 			}
+
+			var bits uint16
 			if digit > 0 {
 				bits = uint16(digit) << 1
+				opsPerBucketPerChunk[chunk][uint16(digit)]++
 			} else {
 				bits = (uint16(-digit-1) << 1) + 1
+				opsPerBucketPerChunk[chunk][uint16(-digit-1)]++
 			}
 			digits[int(chunk)*len(scalars)+i] = bits
 			opsPerChunk[chunk]++
@@ -150,26 +168,77 @@
 		}
 
 		chOpsPerChunk <- opsPerChunk
+		chOpsPerBucketPerChunk <- opsPerBucketPerChunk
 	}, nbTasks)
 
 	// aggregate chunk stats
+	chunkStats := make([]chunkStat, nbChunks)
 	close(chOpsPerChunk)
+	close(chOpsPerBucketPerChunk)
 	opsPerChunk := make([]int, nbChunks)
 	totalOps := 0
-	for o := range chOpsPerChunk {
-		for i, nbOps := range o {
+	for chunks := range chOpsPerChunk {
+		for i, nbOps := range chunks {
 			opsPerChunk[i]+=nbOps
 			totalOps += nbOps
 		}
 	}
-	chunkStats := make([]chunkStat, nbChunks)
-	target := float32(totalOps) / float32(nbChunks)
+
+
+	opsPerBucketPerChunk := make([][]int, nbChunks)
+	for i:=0; i < len(opsPerBucketPerChunk);i++ {
+		opsPerBucketPerChunk[i] = make([]int, (1 << (c-1))) // nbBuckets
+	}
+	for chunks := range chOpsPerBucketPerChunk {
+		for i, opsPerBucket := range chunks {
+			for j, o := range opsPerBucket {
+				// bucket j in chunk i has o operations
+				if opsPerBucketPerChunk[i][j] == 0 && o != 0 {
+					chunkStats[i].nonZeroBuckets++
+				}
+				opsPerBucketPerChunk[i][j] += o
+			}
+		}
+	}
+
+	abs := func(v int) int {
+		if v < 0 {
+			return -v
+		}
+		return v
+	}
+
+	// we know the total ops for the chunk and the number of non-zero buckets,
+	// so we can compute the deviation;
+	// TODO @gbotrel do that in goroutines
+	for chunkID:=0; chunkID < len(chunkStats); chunkID++ {
+		nz := chunkStats[chunkID].nonZeroBuckets
+		if nz == 0 {
+			continue // ignore chunk, full of zeroes.
+		}
+		mean := opsPerChunk[chunkID] / nz
+		aad := 0
+		averageOpsPerBucket := 0
+		for _, bucketOps := range opsPerBucketPerChunk[chunkID] {
+			aad += abs(bucketOps - mean)
+			averageOpsPerBucket += bucketOps
+		}
+		chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz
+		chunkStats[chunkID].deviation = aad / nz
+	}
+
+
+	target := totalOps / int(nbChunks)
 	// what percentage are you of the target
-	for i := 0; i < len(chunkStats); i++ {
-		chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target
+	if target != 0 {
+		// if target == 0, all the scalars are 0 everywhere; there is no work to be done.
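+		// e.g. with totalOps = 300 over 3 chunks, target = 100 and chunks doing
+		// 50, 100 and 150 ops get weights 50, 100 and 150; the inner msm then
+		// splits any chunk whose weight reaches 115 in two.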
+ for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = opsPerChunk[i] * 100 / target + } } + return digits, chunkStats } @@ -420,15 +489,16 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem } -func getChunkProcessor{{ $.UPointName }}(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, digits []uint16) { - mustBeExt := false +func getChunkProcessor{{ $.UPointName }}(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, digits []uint16) { switch c { {{range $c := $.CRange}} case {{$c}}: {{- if le $c 9}} return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}] {{- else}} - if mustBeExt { + const batchSize = {{batchSize $c}} + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}] } return processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{$c}}, bitSetC{{$c}}, p{{$.TAffine}}C{{$c}}, pp{{$.TAffine}}C{{$c}}, q{{$.TAffine}}C{{$c}}, c{{$.TAffine}}C{{$c}}] @@ -459,24 +529,24 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessor{{ $.UPointName }}(c) + processChunk := getChunkProcessor{{ $.UPointName }}(c, chunkStats[j]) if j == int(nbChunks - 1) { - processChunk = getChunkProcessor{{ $.UPointName }}(lastC(c)) + processChunk = getChunkProcessor{{ $.UPointName }}(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. chSplit := make(chan {{ $.TJacobianExtended }}, 2) split := n / 2 go processChunk(uint64(j),chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j),chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index 9b259ebca4..f208891b47 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -253,7 +253,9 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
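+	// (each run of 100 consecutive points below shares a single scalar, so within
+	// a chunk those contributions all target the same bucket and keep colliding.)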
for i:=0; i < len(sampleScalarsRedundant);i+=100 { for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] From 1a91d5a74bb1e41cd93b9600e29cdc474876ddac Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 15 Nov 2022 17:38:49 -0600 Subject: [PATCH 28/43] checkpoint --- ecc/bls12-377/multiexp.go | 364 +++++++++++------- ecc/bls12-377/multiexp_affine.go | 104 ++++- ecc/bls12-377/multiexp_test.go | 4 +- ecc/bls12-378/multiexp.go | 364 +++++++++++------- ecc/bls12-378/multiexp_affine.go | 104 ++++- ecc/bls12-378/multiexp_test.go | 4 +- ecc/bls12-381/multiexp.go | 364 +++++++++++------- ecc/bls12-381/multiexp_affine.go | 104 ++++- ecc/bls12-381/multiexp_test.go | 4 +- ecc/bls24-315/multiexp.go | 364 +++++++++++------- ecc/bls24-315/multiexp_affine.go | 104 ++++- ecc/bls24-315/multiexp_test.go | 4 +- ecc/bls24-317/multiexp.go | 364 +++++++++++------- ecc/bls24-317/multiexp_affine.go | 104 ++++- ecc/bls24-317/multiexp_test.go | 4 +- ecc/bn254/multiexp.go | 364 +++++++++++------- ecc/bn254/multiexp_affine.go | 104 ++++- ecc/bn254/multiexp_test.go | 4 +- ecc/bw6-633/multiexp.go | 166 ++++---- ecc/bw6-633/multiexp_affine.go | 104 ++++- ecc/bw6-633/multiexp_test.go | 4 +- ecc/bw6-756/multiexp.go | 166 ++++---- ecc/bw6-756/multiexp_affine.go | 104 ++++- ecc/bw6-756/multiexp_test.go | 4 +- ecc/bw6-761/multiexp.go | 166 ++++---- ecc/bw6-761/multiexp_affine.go | 104 ++++- ecc/bw6-761/multiexp_test.go | 4 +- .../generator/ecc/template/multiexp.go.tmpl | 142 +++---- .../ecc/template/multiexp_affine.go.tmpl | 50 ++- .../ecc/template/tests/multiexp.go.tmpl | 2 +- 30 files changed, 2482 insertions(+), 1366 deletions(-) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index fdd6b005bf..b0c3c27e36 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -18,6 +18,7 @@ package bls12377 import ( "errors" + "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -140,66 +141,100 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - const batchSize = 80 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC10] - } - return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] + // const batchSize = 80 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC10] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - const batchSize = 150 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC11] - } - return processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] + // const batchSize = 150 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC11] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - const batchSize = 200 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC12] - } - return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] + // const batchSize = 200 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC12] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - const batchSize = 350 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC13] - } - return processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] + // const batchSize = 350 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC13] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC13, bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - const batchSize = 400 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC14] - } - return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] + // const batchSize = 400 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC14] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC14, bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - const batchSize = 500 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC15] - } - return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] + // const batchSize = 500 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC15] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC15, bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - const batchSize = 640 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC16] - } - return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + // const batchSize = 640 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC16] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") return processChunkG1Jacobian[bucketg1JacExtendedC16] @@ -223,8 +258,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. 
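	// (the top window only holds the scalars' leftover high bits, so lastC(c) is
	// typically smaller than c and selects a cheaper processor for that chunk.)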
n := len(points) - + // fmt.Printf("\n") + // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { + // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -382,66 +419,100 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - const batchSize = 80 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC10] - } - return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] + // const batchSize = 80 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC10] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - const batchSize = 150 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC11] - } - return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] + // const batchSize = 150 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC11] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - const batchSize = 200 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC12] - } - return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] + // const batchSize = 200 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC12] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - const batchSize = 350 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC13] - } - return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] + // const batchSize = 350 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC13] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC13, bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - const batchSize = 400 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC14] - } - return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] + // const batchSize = 400 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC14] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC14, bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - const batchSize = 500 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC15] - } - return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] + // const batchSize = 500 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC15] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC15, bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - const batchSize = 640 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC16] - } - return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + // const batchSize = 640 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC16] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: // panic("will not happen c != previous values is not generated by templates") return processChunkG2Jacobian[bucketg2JacExtendedC16] @@ -465,8 +536,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - + // fmt.Printf("\n") + // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { + // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) @@ -558,6 +631,10 @@ type chunkStat struct { averageOpsPerBucket int } +func (c *chunkStat) String() string { + return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket) +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. @@ -591,15 +668,7 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - chOpsPerChunk := make(chan []int, nbTasks) - chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) - parallel.Execute(len(scalars), func(start, end int) { - opsPerChunk := make([]int, nbChunks) - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } for i := start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -643,50 +712,15 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks var bits uint16 if digit > 0 { bits = uint16(digit) << 1 - opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 - opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits - opsPerChunk[chunk]++ } } - chOpsPerChunk <- opsPerChunk - chOpsPerBucketPerChunk <- opsPerBucketPerChunk - }, nbTasks) - // aggregate chunk stats - chunkStats := make([]chunkStat, nbChunks) - close(chOpsPerChunk) - close(chOpsPerBucketPerChunk) - opsPerChunk := make([]int, nbChunks) - totalOps := 0 - for chunks := range chOpsPerChunk { - for i, nbOps := range chunks { - opsPerChunk[i] += nbOps - totalOps += nbOps - } - } - - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } - for chunks := range chOpsPerBucketPerChunk { - for i, opsPerBucket := range chunks { - for j, o := range opsPerBucket { - // bucket j in chunk i has o operations - if opsPerBucketPerChunk[i][j] == 0 && o != 0 { - chunkStats[i].nonZeroBuckets++ - } - opsPerBucketPerChunk[i][j] += o - } - } - } - abs := func(v int) int { if v < 0 { return -v @@ -694,31 +728,67 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks return v } - // we know the total ops for the chunk, the number of non 
zero buckets - // so we can compute the deviation; - // TODO @gbotrel do that in go routines - for chunkID := 0; chunkID < len(chunkStats); chunkID++ { - nz := chunkStats[chunkID].nonZeroBuckets - if nz == 0 { - continue // ignore chunk, full of zeroes. - } - mean := opsPerChunk[chunkID] / nz - aad := 0 - averageOpsPerBucket := 0 - for _, bucketOps := range opsPerBucketPerChunk[chunkID] { - aad += abs(bucketOps - mean) - averageOpsPerBucket += bucketOps + // aggregate chunk stats + chunkStats := make([]chunkStat, nbChunks) + parallel.Execute(len(chunkStats), func(start, end int) { + // for each chunk compute the statistics + for chunkID := start; chunkID < end; chunkID++ { + var opsPerBucket [1 << 15]int // max value is 16 for c + // digits for the chunk + chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] + + totalOps := 0 + nz := 0 // non zero buckets count + for _, digit := range chunkDigits { + if digit == 0 { + continue + } + totalOps++ + bucketID := digit >> 1 + if digit&1 == 0 { + bucketID -= 1 + } + if opsPerBucket[bucketID] == 0 { + nz++ + } + opsPerBucket[bucketID]++ + } + chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after + chunkStats[chunkID].nonZeroBuckets = nz + + if nz == 0 { + return // no ops, only zeroes + } + + bound := 1 << (c - 1) + if chunkID == int(nbChunks-1) { + bound = 1 << (lastC(c) - 1) + } + mean := totalOps / nz + aad := 0 + averageOpsPerBucket := 0 + for b := 0; b < bound; b++ { + if opsPerBucket[b] == 0 { + continue + } + aad += abs(opsPerBucket[b] - mean) + averageOpsPerBucket += opsPerBucket[b] + } + chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + chunkStats[chunkID].deviation = aad / nz } - chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - chunkStats[chunkID].deviation = aad / nz + }, nbTasks) + + totalOps := 0 + for _, stat := range chunkStats { + totalOps += stat.weight } target := totalOps / int(nbChunks) - // what percentage are you of the target if target != 0 { // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. 
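		// (chunkStats[i].weight still holds the raw op count for the chunk at
		// this point; the division below rescales it to a percentage of target.)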
for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = opsPerChunk[i] * 100 / target + chunkStats[i].weight = (chunkStats[i].weight * 100) / target } } diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index 65e531185b..e953a6f079 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -36,7 +36,7 @@ func (o batchOpG1Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( +func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( chunk uint64, chRes chan<- g1JacExtended, c uint64, @@ -45,8 +45,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -71,6 +73,31 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG1Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G1Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -115,12 +142,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -154,8 +188,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -164,15 +197,16 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
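	// (the conflicted points queued above were already folded into the
	// extended-Jacobian buckets by flushQueue; executeAndReset settles the
	// last, possibly partial, batch of affine additions.)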
executeAndReset() @@ -316,7 +350,7 @@ func (o batchOpG2Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( +func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( chunk uint64, chRes chan<- g2JacExtended, c uint64, @@ -325,8 +359,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -351,6 +387,31 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG2Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G2Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -395,12 +456,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -434,8 +502,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -444,15 +511,16 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
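	// (the conflicted points queued above were already folded into the
	// extended-Jacobian buckets by flushQueue; executeAndReset settles the
	// last, possibly partial, batch of affine additions.)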
executeAndReset() diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 3f5ea45edd..38d85e79ea 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 711a527770..5833c4e009 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -18,6 +18,7 @@ package bls12378 import ( "errors" + "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-378/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -140,66 +141,100 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - const batchSize = 80 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC10] - } - return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] + // const batchSize = 80 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC10] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - const batchSize = 150 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC11] - } - return processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] + // const batchSize = 150 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC11] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - const batchSize = 200 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC12] - } - return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] + // const batchSize = 200 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC12] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - const batchSize = 350 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC13] - } - return processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] + // const batchSize = 350 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC13] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC13, bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - const batchSize = 400 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC14] - } - return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] + // const batchSize = 400 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC14] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC14, bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - const batchSize = 500 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC15] - } - return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] + // const batchSize = 500 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC15] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC15, bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - const batchSize = 640 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC16] - } - return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + // const batchSize = 640 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC16] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") return processChunkG1Jacobian[bucketg1JacExtendedC16] @@ -223,8 +258,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - + // fmt.Printf("\n") + // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { + // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -382,66 +419,100 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - const batchSize = 80 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC10] - } - return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] + // const batchSize = 80 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC10] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - const batchSize = 150 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC11] - } - return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] + // const batchSize = 150 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC11] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - const batchSize = 200 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC12] - } - return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] + // const batchSize = 200 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC12] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - const batchSize = 350 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC13] - } - return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] + // const batchSize = 350 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC13] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC13, bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - const batchSize = 400 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC14] - } - return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] + // const batchSize = 400 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+		// edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20)
+		// if edgeCaseAffine {
+		// 	// fmt.Printf("jacobian \n")
+		// 	return processChunkG2Jacobian[bucketg2JacExtendedC14]
+		// }
+		// fmt.Printf("affine \n")
+		return processChunkG2BatchAffine[bucketg2JacExtendedC14, bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14]
 	case 15:
-		const batchSize = 500
-		edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3)
-		if edgeCaseAffine {
-			return processChunkG2Jacobian[bucketg2JacExtendedC15]
-		}
-		return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15]
+		// const batchSize = 500
+		// status: this currently loses performance in the nominal case;
+		// stat.deviation does not look like a reliable signal.
+		// edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20)
+		// if edgeCaseAffine {
+		// 	// fmt.Printf("jacobian \n")
+		// 	return processChunkG2Jacobian[bucketg2JacExtendedC15]
+		// }
+		// fmt.Printf("affine \n")
+		return processChunkG2BatchAffine[bucketg2JacExtendedC15, bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15]
 	case 16:
-		const batchSize = 640
-		edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3)
-		if edgeCaseAffine {
-			return processChunkG2Jacobian[bucketg2JacExtendedC16]
-		}
-		return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16]
+		// const batchSize = 640
+		// status: this currently loses performance in the nominal case;
+		// stat.deviation does not look like a reliable signal.
+		// edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20)
+		// if edgeCaseAffine {
+		// 	// fmt.Printf("jacobian \n")
+		// 	return processChunkG2Jacobian[bucketg2JacExtendedC16]
+		// }
+		// fmt.Printf("affine \n")
+		return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16]
 	default:
 		// panic("will not happen c != previous values is not generated by templates")
 		return processChunkG2Jacobian[bucketg2JacExtendedC16]
@@ -465,8 +536,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt
 	// the last chunk may be processed with a different method than the rest, as it could be smaller.
 	n := len(points)
-
+	// fmt.Printf("\n")
+	// fmt.Println("n", n)
 	for j := int(nbChunks - 1); j >= 0; j-- {
+		// fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String())
 		processChunk := getChunkProcessorG2(c, chunkStats[j])
 		if j == int(nbChunks-1) {
 			processChunk = getChunkProcessorG2(lastC(c), chunkStats[j])
@@ -558,6 +631,10 @@ type chunkStat struct {
 	averageOpsPerBucket int
 }
 
+func (c *chunkStat) String() string {
+	return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket)
+}
+
 // partitionScalars computes, for each scalar, one digit per c-bit window (nbChunks digits in total);
 // if a digit is larger than 2^{c-1}, we borrow 2^c from the next window and subtract
 // 2^c from the current digit, making it negative.
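Note: the recoding described in the comment above can be illustrated with a short, self-contained sketch. The helper name toyPartition and the fixed window width c = 4 are illustrative only and are not part of this patch:

    package main

    import "fmt"

    // toyPartition splits a small scalar into 4-bit signed digits,
    // borrowing 2^c from the next window whenever a digit exceeds
    // 2^(c-1). Every resulting digit has absolute value at most 2^(c-1).
    func toyPartition(scalar uint64) []int {
        const c = 4
        var digits []int
        carry := 0
        for scalar != 0 || carry != 0 {
            d := int(scalar&((1<<c)-1)) + carry
            scalar >>= c
            carry = 0
            if d > 1<<(c-1) {
                d -= 1 << c // borrow from the next window
                carry = 1
            }
            digits = append(digits, d)
        }
        return digits
    }

    func main() {
        // 255 = -1 + 0*16 + 1*256
        fmt.Println(toyPartition(255)) // [-1 0 1]
    }

The signed digits are what let the bucket count be halved: subtracting an affine point only requires negating its Y coordinate, so a negative digit is as cheap as a positive one.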
@@ -591,15 +668,7 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - chOpsPerChunk := make(chan []int, nbTasks) - chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) - parallel.Execute(len(scalars), func(start, end int) { - opsPerChunk := make([]int, nbChunks) - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } for i := start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -643,50 +712,15 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks var bits uint16 if digit > 0 { bits = uint16(digit) << 1 - opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 - opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits - opsPerChunk[chunk]++ } } - chOpsPerChunk <- opsPerChunk - chOpsPerBucketPerChunk <- opsPerBucketPerChunk - }, nbTasks) - // aggregate chunk stats - chunkStats := make([]chunkStat, nbChunks) - close(chOpsPerChunk) - close(chOpsPerBucketPerChunk) - opsPerChunk := make([]int, nbChunks) - totalOps := 0 - for chunks := range chOpsPerChunk { - for i, nbOps := range chunks { - opsPerChunk[i] += nbOps - totalOps += nbOps - } - } - - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } - for chunks := range chOpsPerBucketPerChunk { - for i, opsPerBucket := range chunks { - for j, o := range opsPerBucket { - // bucket j in chunk i has o operations - if opsPerBucketPerChunk[i][j] == 0 && o != 0 { - chunkStats[i].nonZeroBuckets++ - } - opsPerBucketPerChunk[i][j] += o - } - } - } - abs := func(v int) int { if v < 0 { return -v @@ -694,31 +728,67 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks return v } - // we know the total ops for the chunk, the number of non zero buckets - // so we can compute the deviation; - // TODO @gbotrel do that in go routines - for chunkID := 0; chunkID < len(chunkStats); chunkID++ { - nz := chunkStats[chunkID].nonZeroBuckets - if nz == 0 { - continue // ignore chunk, full of zeroes. 
-		}
-		mean := opsPerChunk[chunkID] / nz
-		aad := 0
-		averageOpsPerBucket := 0
-		for _, bucketOps := range opsPerBucketPerChunk[chunkID] {
-			aad += abs(bucketOps - mean)
-			averageOpsPerBucket += bucketOps
+	// aggregate chunk stats
+	chunkStats := make([]chunkStat, nbChunks)
+	parallel.Execute(len(chunkStats), func(start, end int) {
+		// for each chunk compute the statistics
+		for chunkID := start; chunkID < end; chunkID++ {
+			var opsPerBucket [1 << 15]int // c is at most 16, so there are at most 1 << 15 buckets
+			// digits for the chunk
+			chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)]
+
+			totalOps := 0
+			nz := 0 // number of non-zero buckets
+			for _, digit := range chunkDigits {
+				if digit == 0 {
+					continue
+				}
+				totalOps++
+				bucketID := digit >> 1
+				if digit&1 == 0 {
+					bucketID--
+				}
+				if opsPerBucket[bucketID] == 0 {
+					nz++
+				}
+				opsPerBucket[bucketID]++
+			}
+			chunkStats[chunkID].weight = totalOps // raw op count for now; normalized into a weight below
+			chunkStats[chunkID].nonZeroBuckets = nz
+
+			if nz == 0 {
+				continue // chunk contains only zero digits; move on to the next chunk
+			}
+
+			bound := 1 << (c - 1)
+			if chunkID == int(nbChunks-1) {
+				bound = 1 << (lastC(c) - 1)
+			}
+			mean := totalOps / nz
+			aad := 0
+			averageOpsPerBucket := 0
+			for b := 0; b < bound; b++ {
+				if opsPerBucket[b] == 0 {
+					continue
+				}
+				aad += abs(opsPerBucket[b] - mean)
+				averageOpsPerBucket += opsPerBucket[b]
+			}
+			chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz
+			chunkStats[chunkID].deviation = aad / nz
 		}
-		chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz
-		chunkStats[chunkID].deviation = aad / nz
+	}, nbTasks)
+
+	totalOps := 0
+	for _, stat := range chunkStats {
+		totalOps += stat.weight
 	}
 
 	target := totalOps / int(nbChunks)
-	// what percentage are you of the target
 	if target != 0 {
 		// if target == 0, it means all the scalars are 0 everywhere, there is no work to be done.
for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = opsPerChunk[i] * 100 / target + chunkStats[i].weight = (chunkStats[i].weight * 100) / target } } diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index f48f316a4a..533cb7304c 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -36,7 +36,7 @@ func (o batchOpG1Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( +func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( chunk uint64, chRes chan<- g1JacExtended, c uint64, @@ -45,8 +45,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -71,6 +73,31 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG1Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G1Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -115,12 +142,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -154,8 +188,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -164,15 +197,16 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
executeAndReset() @@ -316,7 +350,7 @@ func (o batchOpG2Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( +func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( chunk uint64, chRes chan<- g2JacExtended, c uint64, @@ -325,8 +359,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -351,6 +387,31 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG2Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G2Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -395,12 +456,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -434,8 +502,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -444,15 +511,16 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
executeAndReset() diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index fd3aee65e6..846eab44a1 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index 7402fa0c62..50b6f180d3 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -18,6 +18,7 @@ package bls12381 import ( "errors" + "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-381/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -140,66 +141,100 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - const batchSize = 80 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC10] - } - return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] + // const batchSize = 80 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC10] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - const batchSize = 150 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC11] - } - return processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] + // const batchSize = 150 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC11] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - const batchSize = 200 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC12] - } - return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] + // const batchSize = 200 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC12] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - const batchSize = 350 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC13] - } - return processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] + // const batchSize = 350 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC13] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC13, bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - const batchSize = 400 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC14] - } - return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] + // const batchSize = 400 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC14] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC14, bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - const batchSize = 500 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC15] - } - return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] + // const batchSize = 500 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC15] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC15, bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - const batchSize = 640 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC16] - } - return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + // const batchSize = 640 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC16] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") return processChunkG1Jacobian[bucketg1JacExtendedC16] @@ -223,8 +258,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - + // fmt.Printf("\n") + // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { + // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -382,66 +419,100 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - const batchSize = 80 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC10] - } - return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] + // const batchSize = 80 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC10] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - const batchSize = 150 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC11] - } - return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] + // const batchSize = 150 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC11] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - const batchSize = 200 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC12] - } - return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] + // const batchSize = 200 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC12] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - const batchSize = 350 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC13] - } - return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] + // const batchSize = 350 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC13] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC13, bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - const batchSize = 400 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC14] - } - return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] + // const batchSize = 400 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+		// edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20)
+		// if edgeCaseAffine {
+		// 	// fmt.Printf("jacobian \n")
+		// 	return processChunkG2Jacobian[bucketg2JacExtendedC14]
+		// }
+		// fmt.Printf("affine \n")
+		return processChunkG2BatchAffine[bucketg2JacExtendedC14, bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14]
 	case 15:
-		const batchSize = 500
-		edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3)
-		if edgeCaseAffine {
-			return processChunkG2Jacobian[bucketg2JacExtendedC15]
-		}
-		return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15]
+		// const batchSize = 500
+		// status: this currently loses performance in the nominal case;
+		// stat.deviation does not look like a reliable signal.
+		// edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20)
+		// if edgeCaseAffine {
+		// 	// fmt.Printf("jacobian \n")
+		// 	return processChunkG2Jacobian[bucketg2JacExtendedC15]
+		// }
+		// fmt.Printf("affine \n")
+		return processChunkG2BatchAffine[bucketg2JacExtendedC15, bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15]
 	case 16:
-		const batchSize = 640
-		edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3)
-		if edgeCaseAffine {
-			return processChunkG2Jacobian[bucketg2JacExtendedC16]
-		}
-		return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16]
+		// const batchSize = 640
+		// status: this currently loses performance in the nominal case;
+		// stat.deviation does not look like a reliable signal.
+		// edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20)
+		// if edgeCaseAffine {
+		// 	// fmt.Printf("jacobian \n")
+		// 	return processChunkG2Jacobian[bucketg2JacExtendedC16]
+		// }
+		// fmt.Printf("affine \n")
+		return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16]
 	default:
 		// panic("will not happen c != previous values is not generated by templates")
 		return processChunkG2Jacobian[bucketg2JacExtendedC16]
@@ -465,8 +536,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt
 	// the last chunk may be processed with a different method than the rest, as it could be smaller.
 	n := len(points)
-
+	// fmt.Printf("\n")
+	// fmt.Println("n", n)
 	for j := int(nbChunks - 1); j >= 0; j-- {
+		// fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String())
 		processChunk := getChunkProcessorG2(c, chunkStats[j])
 		if j == int(nbChunks-1) {
 			processChunk = getChunkProcessorG2(lastC(c), chunkStats[j])
@@ -558,6 +631,10 @@ type chunkStat struct {
 	averageOpsPerBucket int
 }
 
+func (c *chunkStat) String() string {
+	return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket)
+}
+
 // partitionScalars computes, for each scalar, one digit per c-bit window (nbChunks digits in total);
 // if a digit is larger than 2^{c-1}, we borrow 2^c from the next window and subtract
 // 2^c from the current digit, making it negative.
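Note: since the same chunkStat fields recur for every curve in this patch, here is a small worked example of the statistics derived per chunk; the ops-per-bucket histogram is invented for illustration. deviation is the average absolute distance to the mean over non-zero buckets, the signal the (currently disabled) edgeCaseAffine heuristic consumes:

    package main

    import "fmt"

    func abs(v int) int {
        if v < 0 {
            return -v
        }
        return v
    }

    func main() {
        // hypothetical ops-per-bucket histogram for one chunk
        opsPerBucket := []int{0, 4, 0, 2, 6, 0, 0, 4}

        totalOps, nz := 0, 0 // total ops and non-zero bucket count
        for _, o := range opsPerBucket {
            if o != 0 {
                totalOps += o
                nz++
            }
        }
        mean := totalOps / nz // 16 / 4 = 4
        aad := 0              // accumulate |ops - mean| over non-zero buckets
        for _, o := range opsPerBucket {
            if o != 0 {
                aad += abs(o - mean)
            }
        }
        fmt.Println(totalOps, nz, mean, aad/nz) // 16 4 4 1
    }

A perfectly uniform chunk has deviation 0; a chunk whose ops concentrate in a few buckets has a large deviation, which is when falling back to the extended-Jacobian processor is expected to pay off.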
@@ -591,15 +668,7 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - chOpsPerChunk := make(chan []int, nbTasks) - chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) - parallel.Execute(len(scalars), func(start, end int) { - opsPerChunk := make([]int, nbChunks) - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } for i := start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -643,50 +712,15 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks var bits uint16 if digit > 0 { bits = uint16(digit) << 1 - opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 - opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits - opsPerChunk[chunk]++ } } - chOpsPerChunk <- opsPerChunk - chOpsPerBucketPerChunk <- opsPerBucketPerChunk - }, nbTasks) - // aggregate chunk stats - chunkStats := make([]chunkStat, nbChunks) - close(chOpsPerChunk) - close(chOpsPerBucketPerChunk) - opsPerChunk := make([]int, nbChunks) - totalOps := 0 - for chunks := range chOpsPerChunk { - for i, nbOps := range chunks { - opsPerChunk[i] += nbOps - totalOps += nbOps - } - } - - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } - for chunks := range chOpsPerBucketPerChunk { - for i, opsPerBucket := range chunks { - for j, o := range opsPerBucket { - // bucket j in chunk i has o operations - if opsPerBucketPerChunk[i][j] == 0 && o != 0 { - chunkStats[i].nonZeroBuckets++ - } - opsPerBucketPerChunk[i][j] += o - } - } - } - abs := func(v int) int { if v < 0 { return -v @@ -694,31 +728,67 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks return v } - // we know the total ops for the chunk, the number of non zero buckets - // so we can compute the deviation; - // TODO @gbotrel do that in go routines - for chunkID := 0; chunkID < len(chunkStats); chunkID++ { - nz := chunkStats[chunkID].nonZeroBuckets - if nz == 0 { - continue // ignore chunk, full of zeroes. 
-		}
-		mean := opsPerChunk[chunkID] / nz
-		aad := 0
-		averageOpsPerBucket := 0
-		for _, bucketOps := range opsPerBucketPerChunk[chunkID] {
-			aad += abs(bucketOps - mean)
-			averageOpsPerBucket += bucketOps
+	// aggregate chunk stats
+	chunkStats := make([]chunkStat, nbChunks)
+	parallel.Execute(len(chunkStats), func(start, end int) {
+		// for each chunk compute the statistics
+		for chunkID := start; chunkID < end; chunkID++ {
+			var opsPerBucket [1 << 15]int // c is at most 16, so there are at most 1 << 15 buckets
+			// digits for the chunk
+			chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)]
+
+			totalOps := 0
+			nz := 0 // number of non-zero buckets
+			for _, digit := range chunkDigits {
+				if digit == 0 {
+					continue
+				}
+				totalOps++
+				bucketID := digit >> 1
+				if digit&1 == 0 {
+					bucketID--
+				}
+				if opsPerBucket[bucketID] == 0 {
+					nz++
+				}
+				opsPerBucket[bucketID]++
+			}
+			chunkStats[chunkID].weight = totalOps // raw op count for now; normalized into a weight below
+			chunkStats[chunkID].nonZeroBuckets = nz
+
+			if nz == 0 {
+				continue // chunk contains only zero digits; move on to the next chunk
+			}
+
+			bound := 1 << (c - 1)
+			if chunkID == int(nbChunks-1) {
+				bound = 1 << (lastC(c) - 1)
+			}
+			mean := totalOps / nz
+			aad := 0
+			averageOpsPerBucket := 0
+			for b := 0; b < bound; b++ {
+				if opsPerBucket[b] == 0 {
+					continue
+				}
+				aad += abs(opsPerBucket[b] - mean)
+				averageOpsPerBucket += opsPerBucket[b]
+			}
+			chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz
+			chunkStats[chunkID].deviation = aad / nz
 		}
-		chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz
-		chunkStats[chunkID].deviation = aad / nz
+	}, nbTasks)
+
+	totalOps := 0
+	for _, stat := range chunkStats {
+		totalOps += stat.weight
 	}
 
 	target := totalOps / int(nbChunks)
-	// what percentage are you of the target
 	if target != 0 {
 		// if target == 0, it means all the scalars are 0 everywhere, there is no work to be done.
for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = opsPerChunk[i] * 100 / target + chunkStats[i].weight = (chunkStats[i].weight * 100) / target } } diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index 2e3776394d..61ebeeebb0 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -36,7 +36,7 @@ func (o batchOpG1Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( +func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( chunk uint64, chRes chan<- g1JacExtended, c uint64, @@ -45,8 +45,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -71,6 +73,31 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG1Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G1Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -115,12 +142,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -154,8 +188,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -164,15 +197,16 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
executeAndReset() @@ -316,7 +350,7 @@ func (o batchOpG2Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( +func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( chunk uint64, chRes chan<- g2JacExtended, c uint64, @@ -325,8 +359,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -351,6 +387,31 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG2Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G2Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -395,12 +456,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -434,8 +502,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -444,15 +511,16 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
executeAndReset() diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index 9ce352672c..65e74e0491 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index f1b13bdfc1..3c05caa715 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -18,6 +18,7 @@ package bls24315 import ( "errors" + "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls24-315/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -140,66 +141,100 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - const batchSize = 80 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC10] - } - return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] + // const batchSize = 80 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC10] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - const batchSize = 150 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC11] - } - return processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] + // const batchSize = 150 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC11] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - const batchSize = 200 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC12] - } - return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] + // const batchSize = 200 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC12] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - const batchSize = 350 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC13] - } - return processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] + // const batchSize = 350 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC13] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC13, bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - const batchSize = 400 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC14] - } - return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] + // const batchSize = 400 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC14] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC14, bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - const batchSize = 500 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC15] - } - return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] + // const batchSize = 500 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC15] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC15, bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - const batchSize = 640 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC16] - } - return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + // const batchSize = 640 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC16] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") return processChunkG1Jacobian[bucketg1JacExtendedC16] @@ -223,8 +258,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - + // fmt.Printf("\n") + // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { + // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -382,66 +419,100 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - const batchSize = 80 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC10] - } - return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] + // const batchSize = 80 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC10] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - const batchSize = 150 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC11] - } - return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] + // const batchSize = 150 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC11] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - const batchSize = 200 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC12] - } - return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] + // const batchSize = 200 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC12] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - const batchSize = 350 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC13] - } - return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] + // const batchSize = 350 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC13] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC13, bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - const batchSize = 400 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC14] - } - return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] + // const batchSize = 400 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+		// edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20)
+		// if edgeCaseAffine {
+		// 	// fmt.Printf("jacobian \n")
+		// 	return processChunkG2Jacobian[bucketg2JacExtendedC14]
+		// }
+		// fmt.Printf("affine \n")
+		return processChunkG2BatchAffine[bucketg2JacExtendedC14, bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14]
 	case 15:
-		const batchSize = 500
-		edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3)
-		if edgeCaseAffine {
-			return processChunkG2Jacobian[bucketg2JacExtendedC15]
-		}
-		return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15]
+		// const batchSize = 500
+		// status: this currently loses performance in the nominal case;
+		// stat.deviation does not look like a reliable signal.
+		// edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20)
+		// if edgeCaseAffine {
+		// 	// fmt.Printf("jacobian \n")
+		// 	return processChunkG2Jacobian[bucketg2JacExtendedC15]
+		// }
+		// fmt.Printf("affine \n")
+		return processChunkG2BatchAffine[bucketg2JacExtendedC15, bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15]
 	case 16:
-		const batchSize = 640
-		edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3)
-		if edgeCaseAffine {
-			return processChunkG2Jacobian[bucketg2JacExtendedC16]
-		}
-		return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16]
+		// const batchSize = 640
+		// status: this currently loses performance in the nominal case;
+		// stat.deviation does not look like a reliable signal.
+		// edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20)
+		// if edgeCaseAffine {
+		// 	// fmt.Printf("jacobian \n")
+		// 	return processChunkG2Jacobian[bucketg2JacExtendedC16]
+		// }
+		// fmt.Printf("affine \n")
+		return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16]
 	default:
 		// panic("will not happen c != previous values is not generated by templates")
 		return processChunkG2Jacobian[bucketg2JacExtendedC16]
@@ -465,8 +536,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt
 	// the last chunk may be processed with a different method than the rest, as it could be smaller.
 	n := len(points)
-
+	// fmt.Printf("\n")
+	// fmt.Println("n", n)
 	for j := int(nbChunks - 1); j >= 0; j-- {
+		// fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String())
 		processChunk := getChunkProcessorG2(c, chunkStats[j])
 		if j == int(nbChunks-1) {
 			processChunk = getChunkProcessorG2(lastC(c), chunkStats[j])
@@ -558,6 +631,10 @@ type chunkStat struct {
 	averageOpsPerBucket int
 }
 
+func (c *chunkStat) String() string {
+	return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket)
+}
+
 // partitionScalars computes, for each scalar, one digit per c-bit window (nbChunks digits in total);
 // if a digit is larger than 2^{c-1}, we borrow 2^c from the next window and subtract
 // 2^c from the current digit, making it negative.
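Note: the statistics loop in the hunk below decodes the packed uint16 digits produced by partitionScalars: a zero digit means "skip", the low bit distinguishes a subtraction from an addition, and the remaining bits carry the bucket index (off by one for positive digits, matching the encoding bits = digit<<1 for positive digits and bits = (-digit-1)<<1 + 1 for negative ones). A minimal sketch of this decoding, with the function name decode chosen for illustration:

    package main

    import "fmt"

    // decode recovers (bucketID, isNeg) from a packed digit; a value of
    // 0 is reserved for "no operation" and must be filtered out first.
    func decode(bits uint16) (bucketID uint16, isNeg bool) {
        bucketID = bits >> 1
        if bits&1 == 0 {
            bucketID-- // positive digit d is stored as d<<1, so it maps to bucket d-1
        }
        return bucketID, bits&1 == 1
    }

    func main() {
        fmt.Println(decode(6)) // digit +3: bucket 2, add
        fmt.Println(decode(5)) // digit -3: bucket 2, subtract
    }

Both signs of a digit d land in bucket |d|-1, which is why 2^{c-1} buckets suffice for a c-bit window.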
@@ -591,15 +668,7 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - chOpsPerChunk := make(chan []int, nbTasks) - chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) - parallel.Execute(len(scalars), func(start, end int) { - opsPerChunk := make([]int, nbChunks) - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } for i := start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -643,50 +712,15 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks var bits uint16 if digit > 0 { bits = uint16(digit) << 1 - opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 - opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits - opsPerChunk[chunk]++ } } - chOpsPerChunk <- opsPerChunk - chOpsPerBucketPerChunk <- opsPerBucketPerChunk - }, nbTasks) - // aggregate chunk stats - chunkStats := make([]chunkStat, nbChunks) - close(chOpsPerChunk) - close(chOpsPerBucketPerChunk) - opsPerChunk := make([]int, nbChunks) - totalOps := 0 - for chunks := range chOpsPerChunk { - for i, nbOps := range chunks { - opsPerChunk[i] += nbOps - totalOps += nbOps - } - } - - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } - for chunks := range chOpsPerBucketPerChunk { - for i, opsPerBucket := range chunks { - for j, o := range opsPerBucket { - // bucket j in chunk i has o operations - if opsPerBucketPerChunk[i][j] == 0 && o != 0 { - chunkStats[i].nonZeroBuckets++ - } - opsPerBucketPerChunk[i][j] += o - } - } - } - abs := func(v int) int { if v < 0 { return -v @@ -694,31 +728,67 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks return v } - // we know the total ops for the chunk, the number of non zero buckets - // so we can compute the deviation; - // TODO @gbotrel do that in go routines - for chunkID := 0; chunkID < len(chunkStats); chunkID++ { - nz := chunkStats[chunkID].nonZeroBuckets - if nz == 0 { - continue // ignore chunk, full of zeroes. 
-		}
-		mean := opsPerChunk[chunkID] / nz
-		aad := 0
-		averageOpsPerBucket := 0
-		for _, bucketOps := range opsPerBucketPerChunk[chunkID] {
-			aad += abs(bucketOps - mean)
-			averageOpsPerBucket += bucketOps
+	// aggregate chunk stats
+	chunkStats := make([]chunkStat, nbChunks)
+	parallel.Execute(len(chunkStats), func(start, end int) {
+		// for each chunk compute the statistics
+		for chunkID := start; chunkID < end; chunkID++ {
+			var opsPerBucket [1 << 15]int // c is at most 16, so at most 1<<15 buckets
+			// digits for the chunk
+			chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)]
+
+			totalOps := 0
+			nz := 0 // number of non-zero buckets
+			for _, digit := range chunkDigits {
+				if digit == 0 {
+					continue
+				}
+				totalOps++
+				bucketID := digit >> 1
+				if digit&1 == 0 {
+					bucketID -= 1
+				}
+				if opsPerBucket[bucketID] == 0 {
+					nz++
+				}
+				opsPerBucket[bucketID]++
+			}
+			chunkStats[chunkID].weight = totalOps // raw op count for now; normalized into a percentage weight below
+			chunkStats[chunkID].nonZeroBuckets = nz
+
+			if nz == 0 {
+				continue // chunk is all zeroes; move on to the next chunk in this range
+			}
+
+			bound := 1 << (c - 1)
+			if chunkID == int(nbChunks-1) {
+				bound = 1 << (lastC(c) - 1)
+			}
+			mean := totalOps / nz
+			aad := 0
+			averageOpsPerBucket := 0
+			for b := 0; b < bound; b++ {
+				if opsPerBucket[b] == 0 {
+					continue
+				}
+				aad += abs(opsPerBucket[b] - mean)
+				averageOpsPerBucket += opsPerBucket[b]
+			}
+			chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz
+			chunkStats[chunkID].deviation = aad / nz
 		}
-		chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz
-		chunkStats[chunkID].deviation = aad / nz
+	}, nbTasks)
+
+	totalOps := 0
+	for _, stat := range chunkStats {
+		totalOps += stat.weight
 	}
 
 	target := totalOps / int(nbChunks)
-	// what percentage are you of the target
 	if target != 0 {
 		// if target == 0, it means all the scalars are 0 everywhere, there is no work to be done.
for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = opsPerChunk[i] * 100 / target + chunkStats[i].weight = (chunkStats[i].weight * 100) / target } } diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index d253497f17..85af6357fd 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -36,7 +36,7 @@ func (o batchOpG1Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( +func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( chunk uint64, chRes chan<- g1JacExtended, c uint64, @@ -45,8 +45,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -71,6 +73,31 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG1Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G1Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -115,12 +142,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -154,8 +188,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -164,15 +197,16 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
executeAndReset() @@ -316,7 +350,7 @@ func (o batchOpG2Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( +func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( chunk uint64, chRes chan<- g2JacExtended, c uint64, @@ -325,8 +359,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -351,6 +387,31 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG2Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G2Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -395,12 +456,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -434,8 +502,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -444,15 +511,16 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
executeAndReset() diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index c40b9ccf21..1eb2ff3e0f 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index e1c79c6ba5..2f80a71f5c 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -18,6 +18,7 @@ package bls24317 import ( "errors" + "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls24-317/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -140,66 +141,100 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - const batchSize = 80 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC10] - } - return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] + // const batchSize = 80 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC10] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - const batchSize = 150 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC11] - } - return processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] + // const batchSize = 150 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC11] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - const batchSize = 200 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC12] - } - return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] + // const batchSize = 200 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC12] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - const batchSize = 350 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC13] - } - return processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] + // const batchSize = 350 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC13] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC13, bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - const batchSize = 400 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC14] - } - return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] + // const batchSize = 400 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC14] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC14, bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - const batchSize = 500 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC15] - } - return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] + // const batchSize = 500 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC15] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC15, bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - const batchSize = 640 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC16] - } - return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + // const batchSize = 640 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC16] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") return processChunkG1Jacobian[bucketg1JacExtendedC16] @@ -223,8 +258,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - + // fmt.Printf("\n") + // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { + // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -382,66 +419,100 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - const batchSize = 80 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC10] - } - return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] + // const batchSize = 80 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC10] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - const batchSize = 150 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC11] - } - return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] + // const batchSize = 150 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC11] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - const batchSize = 200 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC12] - } - return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] + // const batchSize = 200 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC12] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - const batchSize = 350 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC13] - } - return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] + // const batchSize = 350 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC13] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC13, bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - const batchSize = 400 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC14] - } - return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] + // const batchSize = 400 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC14] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC14, bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - const batchSize = 500 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC15] - } - return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] + // const batchSize = 500 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC15] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC15, bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - const batchSize = 640 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC16] - } - return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + // const batchSize = 640 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC16] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: // panic("will not happen c != previous values is not generated by templates") return processChunkG2Jacobian[bucketg2JacExtendedC16] @@ -465,8 +536,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - + // fmt.Printf("\n") + // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { + // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) @@ -558,6 +631,10 @@ type chunkStat struct { averageOpsPerBucket int } +func (c *chunkStat) String() string { + return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket) +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
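
The chunkStat fields printed by String() above are filled in by scanning each chunk's packed digits, as in the partitionScalars changes of this patch: an even encoding 2d decodes to bucket d-1, an odd encoding decodes to bits>>1, and the deviation is the average absolute deviation of ops across non-zero buckets. A simplified, self-contained sketch of that computation (signature and slice-based buckets are our own; the patch uses a fixed [1 << 15]int array inside parallel.Execute):

// Illustrative only: per-chunk statistics over packed digits, with our
// own simplified signature.
package main

import "fmt"

func statsForChunk(chunkDigits []uint16, nbBuckets int) (weight, nonZero, deviation int) {
	opsPerBucket := make([]int, nbBuckets)
	for _, digit := range chunkDigits {
		if digit == 0 {
			continue // zero digit: no bucket op scheduled
		}
		weight++
		bucketID := int(digit >> 1)
		if digit&1 == 0 {
			bucketID-- // even encoding is a positive digit d, stored in bucket d-1
		}
		if opsPerBucket[bucketID] == 0 {
			nonZero++
		}
		opsPerBucket[bucketID]++
	}
	if nonZero == 0 {
		return // all-zero chunk: nothing to compute
	}
	mean := weight / nonZero
	aad := 0 // average absolute deviation, in integer arithmetic
	for _, ops := range opsPerBucket {
		if ops == 0 {
			continue
		}
		if ops > mean {
			aad += ops - mean
		} else {
			aad += mean - ops
		}
	}
	deviation = aad / nonZero
	return
}

func main() {
	// digits packed as in the patch: 2 and 14 are positive, 9 is negative
	fmt.Println(statsForChunk([]uint16{0, 2, 2, 9, 14, 0, 2}, 8))
	// prints: 5 3 0  (weight, non-zero buckets, deviation)
}

The integer division makes the reported deviation coarse on small chunks, which echoes the "stat.deviation seems not good" notes above.
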
@@ -591,15 +668,7 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - chOpsPerChunk := make(chan []int, nbTasks) - chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) - parallel.Execute(len(scalars), func(start, end int) { - opsPerChunk := make([]int, nbChunks) - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } for i := start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -643,50 +712,15 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks var bits uint16 if digit > 0 { bits = uint16(digit) << 1 - opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 - opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits - opsPerChunk[chunk]++ } } - chOpsPerChunk <- opsPerChunk - chOpsPerBucketPerChunk <- opsPerBucketPerChunk - }, nbTasks) - // aggregate chunk stats - chunkStats := make([]chunkStat, nbChunks) - close(chOpsPerChunk) - close(chOpsPerBucketPerChunk) - opsPerChunk := make([]int, nbChunks) - totalOps := 0 - for chunks := range chOpsPerChunk { - for i, nbOps := range chunks { - opsPerChunk[i] += nbOps - totalOps += nbOps - } - } - - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } - for chunks := range chOpsPerBucketPerChunk { - for i, opsPerBucket := range chunks { - for j, o := range opsPerBucket { - // bucket j in chunk i has o operations - if opsPerBucketPerChunk[i][j] == 0 && o != 0 { - chunkStats[i].nonZeroBuckets++ - } - opsPerBucketPerChunk[i][j] += o - } - } - } - abs := func(v int) int { if v < 0 { return -v @@ -694,31 +728,67 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks return v } - // we know the total ops for the chunk, the number of non zero buckets - // so we can compute the deviation; - // TODO @gbotrel do that in go routines - for chunkID := 0; chunkID < len(chunkStats); chunkID++ { - nz := chunkStats[chunkID].nonZeroBuckets - if nz == 0 { - continue // ignore chunk, full of zeroes. 
-		}
-		mean := opsPerChunk[chunkID] / nz
-		aad := 0
-		averageOpsPerBucket := 0
-		for _, bucketOps := range opsPerBucketPerChunk[chunkID] {
-			aad += abs(bucketOps - mean)
-			averageOpsPerBucket += bucketOps
+	// aggregate chunk stats
+	chunkStats := make([]chunkStat, nbChunks)
+	parallel.Execute(len(chunkStats), func(start, end int) {
+		// for each chunk compute the statistics
+		for chunkID := start; chunkID < end; chunkID++ {
+			var opsPerBucket [1 << 15]int // c is at most 16, so at most 1<<15 buckets
+			// digits for the chunk
+			chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)]
+
+			totalOps := 0
+			nz := 0 // number of non-zero buckets
+			for _, digit := range chunkDigits {
+				if digit == 0 {
+					continue
+				}
+				totalOps++
+				bucketID := digit >> 1
+				if digit&1 == 0 {
+					bucketID -= 1
+				}
+				if opsPerBucket[bucketID] == 0 {
+					nz++
+				}
+				opsPerBucket[bucketID]++
+			}
+			chunkStats[chunkID].weight = totalOps // raw op count for now; normalized into a percentage weight below
+			chunkStats[chunkID].nonZeroBuckets = nz
+
+			if nz == 0 {
+				continue // chunk is all zeroes; move on to the next chunk in this range
+			}
+
+			bound := 1 << (c - 1)
+			if chunkID == int(nbChunks-1) {
+				bound = 1 << (lastC(c) - 1)
+			}
+			mean := totalOps / nz
+			aad := 0
+			averageOpsPerBucket := 0
+			for b := 0; b < bound; b++ {
+				if opsPerBucket[b] == 0 {
+					continue
+				}
+				aad += abs(opsPerBucket[b] - mean)
+				averageOpsPerBucket += opsPerBucket[b]
+			}
+			chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz
+			chunkStats[chunkID].deviation = aad / nz
 		}
-		chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz
-		chunkStats[chunkID].deviation = aad / nz
+	}, nbTasks)
+
+	totalOps := 0
+	for _, stat := range chunkStats {
+		totalOps += stat.weight
 	}
 
 	target := totalOps / int(nbChunks)
-	// what percentage are you of the target
 	if target != 0 {
 		// if target == 0, it means all the scalars are 0 everywhere, there is no work to be done.
for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = opsPerChunk[i] * 100 / target + chunkStats[i].weight = (chunkStats[i].weight * 100) / target } } diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index d6a509fc82..a6fcb5a2f8 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -36,7 +36,7 @@ func (o batchOpG1Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( +func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( chunk uint64, chRes chan<- g1JacExtended, c uint64, @@ -45,8 +45,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -71,6 +73,31 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG1Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G1Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -115,12 +142,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -154,8 +188,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -164,15 +197,16 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
executeAndReset() @@ -316,7 +350,7 @@ func (o batchOpG2Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( +func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( chunk uint64, chRes chan<- g2JacExtended, c uint64, @@ -325,8 +359,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -351,6 +387,31 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG2Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G2Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -395,12 +456,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -434,8 +502,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -444,15 +511,16 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
executeAndReset() diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index 87caea0886..14af45e5b5 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index 0fe60e8201..0767ef87aa 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -18,6 +18,7 @@ package bn254 import ( "errors" + "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bn254/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -140,66 +141,100 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - const batchSize = 80 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC10] - } - return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] + // const batchSize = 80 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC10] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - const batchSize = 150 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC11] - } - return processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] + // const batchSize = 150 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC11] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - const batchSize = 200 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC12] - } - return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] + // const batchSize = 200 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC12] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - const batchSize = 350 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC13] - } - return processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] + // const batchSize = 350 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC13] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC13, bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - const batchSize = 400 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC14] - } - return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] + // const batchSize = 400 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC14] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC14, bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - const batchSize = 500 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC15] - } - return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] + // const batchSize = 500 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC15] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC15, bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - const batchSize = 640 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC16] - } - return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + // const batchSize = 640 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC16] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") return processChunkG1Jacobian[bucketg1JacExtendedC16] @@ -223,8 +258,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - + // fmt.Printf("\n") + // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { + // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -382,66 +419,100 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - const batchSize = 80 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC10] - } - return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] + // const batchSize = 80 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC10] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - const batchSize = 150 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC11] - } - return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] + // const batchSize = 150 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC11] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - const batchSize = 200 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC12] - } - return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] + // const batchSize = 200 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC12] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - const batchSize = 350 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC13] - } - return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] + // const batchSize = 350 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC13] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC13, bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - const batchSize = 400 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC14] - } - return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] + // const batchSize = 400 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC14] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC14, bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - const batchSize = 500 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC15] - } - return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] + // const batchSize = 500 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC15] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC15, bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - const batchSize = 640 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC16] - } - return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + // const batchSize = 640 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC16] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: // panic("will not happen c != previous values is not generated by templates") return processChunkG2Jacobian[bucketg2JacExtendedC16] @@ -465,8 +536,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - + // fmt.Printf("\n") + // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { + // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) @@ -558,6 +631,10 @@ type chunkStat struct { averageOpsPerBucket int } +func (c *chunkStat) String() string { + return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket) +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
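
The multiexp_affine.go hunks in this patch all rework the same control flow, so a toy model helps. The invariant is that a bucket may appear at most once per batch (otherwise two pending adds would alias the same R[i] before the shared inversion resolves them); conflicting ops are parked in a queue, and with this patch leftovers are flushed once into fallback extended-Jacobian buckets (flushQueue) rather than looping processQueue/executeAndReset. A deliberately simplified sketch with our own types (the real code also retries the queue after each mid-stream batch):

// Illustrative only: a toy model of the batch/queue discipline.
// At most one op per bucket per batch; conflicts wait in the queue.
package main

import "fmt"

type op struct{ bucketID int }

func main() {
	const batchSize = 2
	ops := []op{{1}, {1}, {2}, {1}, {3}}

	inFlight := map[int]bool{} // plays the role of the bucketIds bitset
	var batch, queue []op

	// stand-in for executeAndReset: the real code resolves the whole
	// batch with a single field inversion, then clears the bitset.
	execute := func() {
		fmt.Println("batch add:", batch)
		batch = batch[:0]
		inFlight = map[int]bool{}
	}

	for _, o := range ops {
		if inFlight[o.bucketID] {
			queue = append(queue, o) // bucket already in this batch: park the op
			continue
		}
		inFlight[o.bucketID] = true
		batch = append(batch, o)
		if len(batch) == batchSize {
			execute()
		}
	}
	if len(batch) > 0 {
		execute()
	}
	// flushQueue: with this patch, leftovers go straight to a fallback
	// accumulator (extended-Jacobian buckets) instead of re-batching.
	fmt.Println("fallback flush:", queue)
}

Running it executes two full batches ({1},{2} and {1},{3}) and leaves one conflicted op on bucket 1 for the fallback flush.
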
@@ -591,15 +668,7 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - chOpsPerChunk := make(chan []int, nbTasks) - chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) - parallel.Execute(len(scalars), func(start, end int) { - opsPerChunk := make([]int, nbChunks) - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } for i := start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -643,50 +712,15 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks var bits uint16 if digit > 0 { bits = uint16(digit) << 1 - opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 - opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits - opsPerChunk[chunk]++ } } - chOpsPerChunk <- opsPerChunk - chOpsPerBucketPerChunk <- opsPerBucketPerChunk - }, nbTasks) - // aggregate chunk stats - chunkStats := make([]chunkStat, nbChunks) - close(chOpsPerChunk) - close(chOpsPerBucketPerChunk) - opsPerChunk := make([]int, nbChunks) - totalOps := 0 - for chunks := range chOpsPerChunk { - for i, nbOps := range chunks { - opsPerChunk[i] += nbOps - totalOps += nbOps - } - } - - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } - for chunks := range chOpsPerBucketPerChunk { - for i, opsPerBucket := range chunks { - for j, o := range opsPerBucket { - // bucket j in chunk i has o operations - if opsPerBucketPerChunk[i][j] == 0 && o != 0 { - chunkStats[i].nonZeroBuckets++ - } - opsPerBucketPerChunk[i][j] += o - } - } - } - abs := func(v int) int { if v < 0 { return -v @@ -694,31 +728,67 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks return v } - // we know the total ops for the chunk, the number of non zero buckets - // so we can compute the deviation; - // TODO @gbotrel do that in go routines - for chunkID := 0; chunkID < len(chunkStats); chunkID++ { - nz := chunkStats[chunkID].nonZeroBuckets - if nz == 0 { - continue // ignore chunk, full of zeroes. 
-		}
-		mean := opsPerChunk[chunkID] / nz
-		aad := 0
-		averageOpsPerBucket := 0
-		for _, bucketOps := range opsPerBucketPerChunk[chunkID] {
-			aad += abs(bucketOps - mean)
-			averageOpsPerBucket += bucketOps
+	// aggregate chunk stats
+	chunkStats := make([]chunkStat, nbChunks)
+	parallel.Execute(len(chunkStats), func(start, end int) {
+		// for each chunk compute the statistics
+		for chunkID := start; chunkID < end; chunkID++ {
+			var opsPerBucket [1 << 15]int // c is at most 16, so at most 1<<15 buckets
+			// digits for the chunk
+			chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)]
+
+			totalOps := 0
+			nz := 0 // number of non-zero buckets
+			for _, digit := range chunkDigits {
+				if digit == 0 {
+					continue
+				}
+				totalOps++
+				bucketID := digit >> 1
+				if digit&1 == 0 {
+					bucketID -= 1
+				}
+				if opsPerBucket[bucketID] == 0 {
+					nz++
+				}
+				opsPerBucket[bucketID]++
+			}
+			chunkStats[chunkID].weight = totalOps // raw op count for now; normalized into a percentage weight below
+			chunkStats[chunkID].nonZeroBuckets = nz
+
+			if nz == 0 {
+				continue // chunk is all zeroes; move on to the next chunk in this range
+			}
+
+			bound := 1 << (c - 1)
+			if chunkID == int(nbChunks-1) {
+				bound = 1 << (lastC(c) - 1)
+			}
+			mean := totalOps / nz
+			aad := 0
+			averageOpsPerBucket := 0
+			for b := 0; b < bound; b++ {
+				if opsPerBucket[b] == 0 {
+					continue
+				}
+				aad += abs(opsPerBucket[b] - mean)
+				averageOpsPerBucket += opsPerBucket[b]
+			}
+			chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz
+			chunkStats[chunkID].deviation = aad / nz
 		}
-		chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz
-		chunkStats[chunkID].deviation = aad / nz
+	}, nbTasks)
+
+	totalOps := 0
+	for _, stat := range chunkStats {
+		totalOps += stat.weight
 	}
 
 	target := totalOps / int(nbChunks)
-	// what percentage are you of the target
 	if target != 0 {
 		// if target == 0, it means all the scalars are 0 everywhere, there is no work to be done.
for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = opsPerChunk[i] * 100 / target + chunkStats[i].weight = (chunkStats[i].weight * 100) / target } } diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index 939a1b71f2..044f6834fb 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -36,7 +36,7 @@ func (o batchOpG1Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( +func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( chunk uint64, chRes chan<- g1JacExtended, c uint64, @@ -45,8 +45,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -71,6 +73,31 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG1Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G1Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -115,12 +142,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -154,8 +188,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -164,15 +197,16 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
executeAndReset() @@ -316,7 +350,7 @@ func (o batchOpG2Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( +func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( chunk uint64, chRes chan<- g2JacExtended, c uint64, @@ -325,8 +359,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -351,6 +387,31 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG2Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G2Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -395,12 +456,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -434,8 +502,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -444,15 +511,16 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
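The structural change in both hunks: leftover conflicting ops are no longer drained through repeated executeAndReset/processQueue rounds (the commented-out loop); flushQueue folds them with addMixed into a second, extended-Jacobian bucket array, bucketsJE. addMixed needs no inversion and already copes with the doubling and infinity corner cases that the affine path still carries a TODO for. The cost is that the chunk reduction must now merge both bucket arrays; the reduction itself is outside these hunks, but it presumably looks like the sketch below (the ZZ == 0 emptiness test is an assumption about how the extended-Jacobian representation encodes infinity):

	// Hypothetical merged reduction: fold each affine bucket and its
	// conflict-queue spillover into the usual weighted running sum.
	var runningSum, total g1JacExtended
	runningSum.setInfinity()
	total.setInfinity()
	for k := len(buckets) - 1; k >= 0; k-- {
		if !buckets[k].IsInfinity() {
			runningSum.addMixed(&buckets[k]) // affine bucket
		}
		if !bucketsJE[k].ZZ.IsZero() { // assumed: ZZ == 0 encodes infinity
			runningSum.add(&bucketsJE[k]) // spillover from flushQueue
		}
		total.add(&runningSum)
	}
	chRes <- total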
executeAndReset() diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index 378662496e..b3a812b962 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index c8d0d6518e..6bc12d487a 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -18,6 +18,7 @@ package bw6633 import ( "errors" + "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-633/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -140,18 +141,25 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 8: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 16: - const batchSize = 640 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC16] - } - return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + // const batchSize = 640 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC16] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") return processChunkG1Jacobian[bucketg1JacExtendedC16] @@ -175,8 +183,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. 
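Both the dispatch and the statistics special-case the top window, whose width lastC(c) is whatever remains of the scalar once the full c-bit windows are cut. Indicatively (the real formula lives in the generated code and accounts for the extra bit the signed recoding can carry out, so treat the constants below as illustrative, not authoritative):

	// Illustrative width of the top window for a b-bit scalar and c-bit
	// windows, with one extra bit reserved for the recoding carry.
	func lastCWidth(b, c int) int {
		nbChunks := (b + 1 + c - 1) / c // ceil((b+1)/c)
		return (b + 1) - (nbChunks-1)*c
	}

For bn254 with b = 254 and c = 16 this gives 16 chunks and a 15-bit top window, which is why the stats loop bounds its bucket scan at 1 << (lastC(c)-1) instead of 1 << (c-1).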
n := len(points) - + // fmt.Printf("\n") + // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { + // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -334,18 +344,25 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 8: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 16: - const batchSize = 640 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC16] - } - return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + // const batchSize = 640 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC16] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: // panic("will not happen c != previous values is not generated by templates") return processChunkG2Jacobian[bucketg2JacExtendedC16] @@ -369,8 +386,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - + // fmt.Printf("\n") + // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { + // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) @@ -462,6 +481,10 @@ type chunkStat struct { averageOpsPerBucket int } +func (c *chunkStat) String() string { + return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket) +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
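A worked instance of the borrowing rule this comment describes, with c = 4 so digits are forced into [-(2^3), 2^3] (a hypothetical toy helper, not the library code):

	// recode one c-bit window plus the incoming carry; c = 4 here.
	func recode(window, carry int) (digit, carryOut int) {
		digit = window + carry
		if digit > 1<<3 { // digit > 2^{c-1}: borrow 2^c from the next window
			return digit - 1<<4, 1
		}
		return digit, 0
	}

For the scalar 47 = 0b10_1111, the low window is 15 and the high window is 2: 15 recodes to -1 with a carry, the carry lifts the high window to 3, and indeed 3*16 - 1 = 47. Negative digits are what let one bucket serve both P and -P, halving the bucket count per chunk.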
@@ -495,15 +518,7 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - chOpsPerChunk := make(chan []int, nbTasks) - chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) - parallel.Execute(len(scalars), func(start, end int) { - opsPerChunk := make([]int, nbChunks) - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } for i := start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -547,50 +562,15 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks var bits uint16 if digit > 0 { bits = uint16(digit) << 1 - opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 - opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits - opsPerChunk[chunk]++ } } - chOpsPerChunk <- opsPerChunk - chOpsPerBucketPerChunk <- opsPerBucketPerChunk - }, nbTasks) - // aggregate chunk stats - chunkStats := make([]chunkStat, nbChunks) - close(chOpsPerChunk) - close(chOpsPerBucketPerChunk) - opsPerChunk := make([]int, nbChunks) - totalOps := 0 - for chunks := range chOpsPerChunk { - for i, nbOps := range chunks { - opsPerChunk[i] += nbOps - totalOps += nbOps - } - } - - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } - for chunks := range chOpsPerBucketPerChunk { - for i, opsPerBucket := range chunks { - for j, o := range opsPerBucket { - // bucket j in chunk i has o operations - if opsPerBucketPerChunk[i][j] == 0 && o != 0 { - chunkStats[i].nonZeroBuckets++ - } - opsPerBucketPerChunk[i][j] += o - } - } - } - abs := func(v int) int { if v < 0 { return -v @@ -598,31 +578,67 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks return v } - // we know the total ops for the chunk, the number of non zero buckets - // so we can compute the deviation; - // TODO @gbotrel do that in go routines - for chunkID := 0; chunkID < len(chunkStats); chunkID++ { - nz := chunkStats[chunkID].nonZeroBuckets - if nz == 0 { - continue // ignore chunk, full of zeroes. 
- } - mean := opsPerChunk[chunkID] / nz - aad := 0 - averageOpsPerBucket := 0 - for _, bucketOps := range opsPerBucketPerChunk[chunkID] { - aad += abs(bucketOps - mean) - averageOpsPerBucket += bucketOps + // aggregate chunk stats + chunkStats := make([]chunkStat, nbChunks) + parallel.Execute(len(chunkStats), func(start, end int) { + // for each chunk compute the statistics + for chunkID := start; chunkID < end; chunkID++ { + var opsPerBucket [1 << 15]int // max value is 16 for c + // digits for the chunk + chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] + + totalOps := 0 + nz := 0 // non zero buckets count + for _, digit := range chunkDigits { + if digit == 0 { + continue + } + totalOps++ + bucketID := digit >> 1 + if digit&1 == 0 { + bucketID -= 1 + } + if opsPerBucket[bucketID] == 0 { + nz++ + } + opsPerBucket[bucketID]++ + } + chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after + chunkStats[chunkID].nonZeroBuckets = nz + + if nz == 0 { + return // no ops, only zeroes + } + + bound := 1 << (c - 1) + if chunkID == int(nbChunks-1) { + bound = 1 << (lastC(c) - 1) + } + mean := totalOps / nz + aad := 0 + averageOpsPerBucket := 0 + for b := 0; b < bound; b++ { + if opsPerBucket[b] == 0 { + continue + } + aad += abs(opsPerBucket[b] - mean) + averageOpsPerBucket += opsPerBucket[b] + } + chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + chunkStats[chunkID].deviation = aad / nz } - chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - chunkStats[chunkID].deviation = aad / nz + }, nbTasks) + + totalOps := 0 + for _, stat := range chunkStats { + totalOps += stat.weight } target := totalOps / int(nbChunks) - // what percentage are you of the target if target != 0 { // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. 
for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = opsPerChunk[i] * 100 / target + chunkStats[i].weight = (chunkStats[i].weight * 100) / target } } diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index f3c51b51bf..1de033659f 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -35,7 +35,7 @@ func (o batchOpG1Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( +func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( chunk uint64, chRes chan<- g1JacExtended, c uint64, @@ -44,8 +44,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -70,6 +72,31 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG1Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G1Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -114,12 +141,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -153,8 +187,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -163,15 +196,16 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
executeAndReset() @@ -243,7 +277,7 @@ func (o batchOpG2Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( +func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( chunk uint64, chRes chan<- g2JacExtended, c uint64, @@ -252,8 +286,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -278,6 +314,31 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG2Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G2Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -322,12 +383,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -361,8 +429,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -371,15 +438,16 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
executeAndReset() diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 622c98195b..95312917d9 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index 72a0b35b7b..951ee4bd04 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -18,6 +18,7 @@ package bw6756 import ( "errors" + "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-756/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -140,18 +141,25 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 8: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 16: - const batchSize = 640 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC16] - } - return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + // const batchSize = 640 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC16] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") return processChunkG1Jacobian[bucketg1JacExtendedC16] @@ -175,8 +183,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. 
n := len(points) - + // fmt.Printf("\n") + // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { + // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -334,18 +344,25 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 8: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 16: - const batchSize = 640 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC16] - } - return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + // const batchSize = 640 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC16] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: // panic("will not happen c != previous values is not generated by templates") return processChunkG2Jacobian[bucketg2JacExtendedC16] @@ -369,8 +386,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - + // fmt.Printf("\n") + // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { + // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) @@ -462,6 +481,10 @@ type chunkStat struct { averageOpsPerBucket int } +func (c *chunkStat) String() string { + return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket) +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
@@ -495,15 +518,7 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - chOpsPerChunk := make(chan []int, nbTasks) - chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) - parallel.Execute(len(scalars), func(start, end int) { - opsPerChunk := make([]int, nbChunks) - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } for i := start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -547,50 +562,15 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks var bits uint16 if digit > 0 { bits = uint16(digit) << 1 - opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 - opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits - opsPerChunk[chunk]++ } } - chOpsPerChunk <- opsPerChunk - chOpsPerBucketPerChunk <- opsPerBucketPerChunk - }, nbTasks) - // aggregate chunk stats - chunkStats := make([]chunkStat, nbChunks) - close(chOpsPerChunk) - close(chOpsPerBucketPerChunk) - opsPerChunk := make([]int, nbChunks) - totalOps := 0 - for chunks := range chOpsPerChunk { - for i, nbOps := range chunks { - opsPerChunk[i] += nbOps - totalOps += nbOps - } - } - - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } - for chunks := range chOpsPerBucketPerChunk { - for i, opsPerBucket := range chunks { - for j, o := range opsPerBucket { - // bucket j in chunk i has o operations - if opsPerBucketPerChunk[i][j] == 0 && o != 0 { - chunkStats[i].nonZeroBuckets++ - } - opsPerBucketPerChunk[i][j] += o - } - } - } - abs := func(v int) int { if v < 0 { return -v @@ -598,31 +578,67 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks return v } - // we know the total ops for the chunk, the number of non zero buckets - // so we can compute the deviation; - // TODO @gbotrel do that in go routines - for chunkID := 0; chunkID < len(chunkStats); chunkID++ { - nz := chunkStats[chunkID].nonZeroBuckets - if nz == 0 { - continue // ignore chunk, full of zeroes. 
- } - mean := opsPerChunk[chunkID] / nz - aad := 0 - averageOpsPerBucket := 0 - for _, bucketOps := range opsPerBucketPerChunk[chunkID] { - aad += abs(bucketOps - mean) - averageOpsPerBucket += bucketOps + // aggregate chunk stats + chunkStats := make([]chunkStat, nbChunks) + parallel.Execute(len(chunkStats), func(start, end int) { + // for each chunk compute the statistics + for chunkID := start; chunkID < end; chunkID++ { + var opsPerBucket [1 << 15]int // max value is 16 for c + // digits for the chunk + chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] + + totalOps := 0 + nz := 0 // non zero buckets count + for _, digit := range chunkDigits { + if digit == 0 { + continue + } + totalOps++ + bucketID := digit >> 1 + if digit&1 == 0 { + bucketID -= 1 + } + if opsPerBucket[bucketID] == 0 { + nz++ + } + opsPerBucket[bucketID]++ + } + chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after + chunkStats[chunkID].nonZeroBuckets = nz + + if nz == 0 { + return // no ops, only zeroes + } + + bound := 1 << (c - 1) + if chunkID == int(nbChunks-1) { + bound = 1 << (lastC(c) - 1) + } + mean := totalOps / nz + aad := 0 + averageOpsPerBucket := 0 + for b := 0; b < bound; b++ { + if opsPerBucket[b] == 0 { + continue + } + aad += abs(opsPerBucket[b] - mean) + averageOpsPerBucket += opsPerBucket[b] + } + chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + chunkStats[chunkID].deviation = aad / nz } - chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - chunkStats[chunkID].deviation = aad / nz + }, nbTasks) + + totalOps := 0 + for _, stat := range chunkStats { + totalOps += stat.weight } target := totalOps / int(nbChunks) - // what percentage are you of the target if target != 0 { // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. 
for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = opsPerChunk[i] * 100 / target + chunkStats[i].weight = (chunkStats[i].weight * 100) / target } } diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index 0925a40c35..5b99051b82 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -35,7 +35,7 @@ func (o batchOpG1Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( +func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( chunk uint64, chRes chan<- g1JacExtended, c uint64, @@ -44,8 +44,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -70,6 +72,31 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG1Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G1Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -114,12 +141,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -153,8 +187,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -163,15 +196,16 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
executeAndReset() @@ -243,7 +277,7 @@ func (o batchOpG2Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( +func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( chunk uint64, chRes chan<- g2JacExtended, c uint64, @@ -252,8 +286,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -278,6 +314,31 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG2Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G2Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -322,12 +383,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -361,8 +429,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -371,15 +438,16 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
executeAndReset() diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index f7488f5dd7..a94008b836 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index 32920cef38..fe9a1970a4 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -18,6 +18,7 @@ package bw6761 import ( "errors" + "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -140,18 +141,25 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 8: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 16: - const batchSize = 640 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC16] - } - return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + // const batchSize = 640 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC16] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") return processChunkG1Jacobian[bucketg1JacExtendedC16] @@ -175,8 +183,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. 
n := len(points) - + // fmt.Printf("\n") + // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { + // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -334,18 +344,25 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 8: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 16: - const batchSize = 640 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC16] - } - return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + // const batchSize = 640 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC16] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: // panic("will not happen c != previous values is not generated by templates") return processChunkG2Jacobian[bucketg2JacExtendedC16] @@ -369,8 +386,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - + // fmt.Printf("\n") + // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { + // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) @@ -462,6 +481,10 @@ type chunkStat struct { averageOpsPerBucket int } +func (c *chunkStat) String() string { + return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket) +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
@@ -495,15 +518,7 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - chOpsPerChunk := make(chan []int, nbTasks) - chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) - parallel.Execute(len(scalars), func(start, end int) { - opsPerChunk := make([]int, nbChunks) - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } for i := start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -547,50 +562,15 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks var bits uint16 if digit > 0 { bits = uint16(digit) << 1 - opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 - opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits - opsPerChunk[chunk]++ } } - chOpsPerChunk <- opsPerChunk - chOpsPerBucketPerChunk <- opsPerBucketPerChunk - }, nbTasks) - // aggregate chunk stats - chunkStats := make([]chunkStat, nbChunks) - close(chOpsPerChunk) - close(chOpsPerBucketPerChunk) - opsPerChunk := make([]int, nbChunks) - totalOps := 0 - for chunks := range chOpsPerChunk { - for i, nbOps := range chunks { - opsPerChunk[i] += nbOps - totalOps += nbOps - } - } - - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } - for chunks := range chOpsPerBucketPerChunk { - for i, opsPerBucket := range chunks { - for j, o := range opsPerBucket { - // bucket j in chunk i has o operations - if opsPerBucketPerChunk[i][j] == 0 && o != 0 { - chunkStats[i].nonZeroBuckets++ - } - opsPerBucketPerChunk[i][j] += o - } - } - } - abs := func(v int) int { if v < 0 { return -v @@ -598,31 +578,67 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks return v } - // we know the total ops for the chunk, the number of non zero buckets - // so we can compute the deviation; - // TODO @gbotrel do that in go routines - for chunkID := 0; chunkID < len(chunkStats); chunkID++ { - nz := chunkStats[chunkID].nonZeroBuckets - if nz == 0 { - continue // ignore chunk, full of zeroes. 
- } - mean := opsPerChunk[chunkID] / nz - aad := 0 - averageOpsPerBucket := 0 - for _, bucketOps := range opsPerBucketPerChunk[chunkID] { - aad += abs(bucketOps - mean) - averageOpsPerBucket += bucketOps + // aggregate chunk stats + chunkStats := make([]chunkStat, nbChunks) + parallel.Execute(len(chunkStats), func(start, end int) { + // for each chunk compute the statistics + for chunkID := start; chunkID < end; chunkID++ { + var opsPerBucket [1 << 15]int // max value is 16 for c + // digits for the chunk + chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] + + totalOps := 0 + nz := 0 // non zero buckets count + for _, digit := range chunkDigits { + if digit == 0 { + continue + } + totalOps++ + bucketID := digit >> 1 + if digit&1 == 0 { + bucketID -= 1 + } + if opsPerBucket[bucketID] == 0 { + nz++ + } + opsPerBucket[bucketID]++ + } + chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after + chunkStats[chunkID].nonZeroBuckets = nz + + if nz == 0 { + return // no ops, only zeroes + } + + bound := 1 << (c - 1) + if chunkID == int(nbChunks-1) { + bound = 1 << (lastC(c) - 1) + } + mean := totalOps / nz + aad := 0 + averageOpsPerBucket := 0 + for b := 0; b < bound; b++ { + if opsPerBucket[b] == 0 { + continue + } + aad += abs(opsPerBucket[b] - mean) + averageOpsPerBucket += opsPerBucket[b] + } + chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + chunkStats[chunkID].deviation = aad / nz } - chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - chunkStats[chunkID].deviation = aad / nz + }, nbTasks) + + totalOps := 0 + for _, stat := range chunkStats { + totalOps += stat.weight } target := totalOps / int(nbChunks) - // what percentage are you of the target if target != 0 { // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. 
for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = opsPerChunk[i] * 100 / target + chunkStats[i].weight = (chunkStats[i].weight * 100) / target } } diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index 6b4ec532ea..885062c223 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -35,7 +35,7 @@ func (o batchOpG1Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( +func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( chunk uint64, chRes chan<- g1JacExtended, c uint64, @@ -44,8 +44,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -70,6 +72,31 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG1Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G1Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -114,12 +141,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -153,8 +187,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -163,15 +196,16 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
executeAndReset() @@ -243,7 +277,7 @@ func (o batchOpG2Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( +func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( chunk uint64, chRes chan<- g2JacExtended, c uint64, @@ -252,8 +286,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -278,6 +314,31 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG2Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G2Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -322,12 +383,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -361,8 +429,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -371,15 +438,16 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
executeAndReset() diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index c7fdcf64cf..7d9abce749 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index a274e26e66..f650f65dbc 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -14,6 +14,7 @@ import ( "errors" "math" "runtime" + "fmt" ) {{ template "multiexp" dict "PointName" .G1.PointName "UPointName" (toUpper .G1.PointName) "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange}} @@ -69,6 +70,10 @@ type chunkStat struct { averageOpsPerBucket int } +func (c *chunkStat) String() string { + return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket) +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract @@ -104,15 +109,8 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - chOpsPerChunk := make(chan []int, nbTasks) - chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { - opsPerChunk := make([]int, nbChunks) - opsPerBucketPerChunk := make([][]int, nbChunks) - for i:=0; i < len(opsPerBucketPerChunk);i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c-1))) // nbBuckets - } for i:=start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -157,85 +155,84 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks var bits uint16 if digit > 0 { bits = uint16(digit) << 1 - opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 - opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits - opsPerChunk[chunk]++ } } - chOpsPerChunk <- opsPerChunk - chOpsPerBucketPerChunk <- opsPerBucketPerChunk - }, nbTasks) + abs := func(v int) int { + if v < 0 { + return -v + } + return v + } // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) - close(chOpsPerChunk) - close(chOpsPerBucketPerChunk) - opsPerChunk := make([]int, nbChunks) - totalOps := 0 - for chunks := range chOpsPerChunk { - for i, nbOps := range chunks { - opsPerChunk[i]+=nbOps - totalOps += nbOps - } - } + parallel.Execute(len(chunkStats), func(start, end int) { + // for each chunk compute the statistics + for chunkID := start; chunkID < end; chunkID++ { + var opsPerBucket [1 << 15]int // max value is 16 for c + // digits for the chunk + chunkDigits := digits[chunkID*len(scalars):(chunkID+1)*len(scalars)] + + totalOps := 0 + nz := 0 // non zero buckets count + for _, digit := range chunkDigits { + if digit == 0 { + continue + } + totalOps++ + bucketID := digit >> 1 + if digit &1 == 0 { + bucketID-=1 + } + if opsPerBucket[bucketID] == 0 { + 
nz++ + } + opsPerBucket[bucketID]++ + } + chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after + chunkStats[chunkID].nonZeroBuckets = nz + + if nz == 0 { + return // no ops, only zeroes + } - opsPerBucketPerChunk := make([][]int, nbChunks) - for i:=0; i < len(opsPerBucketPerChunk);i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c-1))) // nbBuckets - } - for chunks := range chOpsPerBucketPerChunk { - for i, opsPerBucket := range chunks { - for j, o := range opsPerBucket { - // bucket j in chunk i has o operations - if opsPerBucketPerChunk[i][j] == 0 && o != 0 { - chunkStats[i].nonZeroBuckets++ + bound := 1 << (c-1) + if chunkID == int(nbChunks-1) { + bound = 1 << (lastC(c)-1) + } + mean := totalOps / nz + aad := 0 + averageOpsPerBucket := 0 + for b:=0; b < bound; b++ { + if opsPerBucket[b] == 0 { + continue } - opsPerBucketPerChunk[i][j] += o + aad += abs(opsPerBucket[b] - mean) + averageOpsPerBucket += opsPerBucket[b] } - } - } - - abs := func(v int) int { - if v < 0 { - return -v + chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + chunkStats[chunkID].deviation = aad / nz } - return v - } + }, nbTasks) - // we know the total ops for the chunk, the number of non zero buckets - // so we can compute the deviation; - // TODO @gbotrel do that in go routines - for chunkID:=0; chunkID < len(chunkStats); chunkID++ { - nz := chunkStats[chunkID].nonZeroBuckets - if nz == 0 { - continue // ignore chunk, full of zeroes. - } - mean := opsPerChunk[chunkID] / nz - aad := 0 - averageOpsPerBucket := 0 - for _, bucketOps := range opsPerBucketPerChunk[chunkID] { - aad += abs(bucketOps - mean) - averageOpsPerBucket += bucketOps - } - chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - chunkStats[chunkID].deviation = aad / nz + totalOps := 0 + for _, stat := range chunkStats { + totalOps+=stat.weight } - target := totalOps / int(nbChunks) - // what percentage are you of the target if target != 0 { // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = opsPerChunk[i] * 100 / target + chunkStats[i].weight = (chunkStats[i].weight * 100) / target } } @@ -494,14 +491,19 @@ func getChunkProcessor{{ $.UPointName }}(c uint64, stat chunkStat) func(chunkID {{range $c := $.CRange}} case {{$c}}: {{- if le $c 9}} + // fmt.Printf("jacobian \n") return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}] {{- else}} - const batchSize = {{batchSize $c}} - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}] - } - return processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{$c}}, bitSetC{{$c}}, p{{$.TAffine}}C{{$c}}, pp{{$.TAffine}}C{{$c}}, q{{$.TAffine}}C{{$c}}, c{{$.TAffine}}C{{$c}}] + // const batchSize = {{batchSize $c}} + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
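The guard deciding between the Jacobian and batch-affine processors is disabled here because its thresholds misfired in the nominal case; restated as a standalone predicate for clarity (illustration only, with the experimental thresholds from the commented-out code):

	// preferJacobian restates the disabled edge-case guard: fall back to the
	// extended-Jacobian processor when non-empty buckets are scarce relative
	// to the batch size (heavy intra-batch conflicts) or when ops are very
	// unevenly spread across buckets.
	func preferJacobian(stat chunkStat, batchSize int) bool {
		return batchSize > stat.nonZeroBuckets/10 || stat.deviation >= 20
	}

The intuition: when few buckets are non-empty, most ops collide with the current batch, the conflict queue dominates, and the shared inversion amortizes over too few independent additions to beat plain extended-Jacobian adds.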
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}] + // } + // fmt.Printf("affine \n") + return processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TJacobianExtended }}C{{$c}}, bucket{{ $.TAffine }}C{{$c}}, bitSetC{{$c}}, p{{$.TAffine}}C{{$c}}, pp{{$.TAffine}}C{{$c}}, q{{$.TAffine}}C{{$c}}, c{{$.TAffine}}C{{$c}}] {{- end}} {{- end}} default: @@ -527,8 +529,10 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - + // fmt.Printf("\n") + // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { + // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) processChunk := getChunkProcessor{{ $.UPointName }}(c, chunkStats[j]) if j == int(nbChunks - 1) { processChunk = getChunkProcessor{{ $.UPointName }}(lastC(c), chunkStats[j]) diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index 513b06f96f..1763f28292 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -41,7 +41,7 @@ func (o batchOp{{ $.TAffine }}) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet, TP p{{ $.TAffine }}, TPP pp{{ $.TAffine }}, TQ qOps{{ $.TAffine }}, TC c{{ $.TAffine}}]( +func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B ib{{ $.TAffine }}, BS bitSet, TP p{{ $.TAffine }}, TPP pp{{ $.TAffine }}, TQ qOps{{ $.TAffine }}, TC c{{ $.TAffine}}]( chunk uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, @@ -50,8 +50,10 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet, T // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -77,7 +79,31 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet, T bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOp{{ $.TAffine }}) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? 
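On the TODO just above: the missing affine doubling is the standard tangent formula, shown here for G1 as a sketch (assuming the short-Weierstrass curves used here with a = 0; the G2 variant is identical over fptower.E2). It spends a full field inversion per call, which is tolerable only because this path is expected to be rare:

	// doubleAffine is a hypothetical helper: p <- 2p in affine coordinates.
	func doubleAffine(p *G1Affine) {
		var lambda, num, den, t, xr fp.Element
		num.Square(&p.X)  // x^2
		t.Double(&num)
		num.Add(&num, &t) // 3x^2 (no +a term since a = 0)
		den.Double(&p.Y)  // 2y
		den.Inverse(&den) // the inversion that makes this path expensive
		lambda.Mul(&num, &den)
		xr.Square(&lambda)
		xr.Sub(&xr, &p.X)
		xr.Sub(&xr, &p.X) // x3 = lambda^2 - 2x
		t.Sub(&p.X, &xr)
		t.Mul(&lambda, &t)
		p.Y.Sub(&t, &p.Y) // y3 = lambda*(x - x3) - y
		p.X.Set(&xr)
	}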
+ BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *{{$.TAffine}}, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -122,13 +148,19 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet, T cptAdd++ } + flushQueue := func () { + for i:=0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } processQueue := func () { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -163,8 +195,7 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet, T // queue is full, flush it. if qID == len(queue) - 1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -173,15 +204,16 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet, T add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. executeAndReset() diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index f208891b47..91af5a996b 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -267,7 +267,7 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { var testPoint {{ $.TAffine }} - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { From fcdcbfdcf58800025a9968b08a0ef46e3a0bbdac Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 16 Nov 2022 07:28:14 -0600 Subject: [PATCH 29/43] checkpoint --- ecc/bls12-377/multiexp.go | 76 ++++++++-------- ecc/bls12-377/multiexp_affine.go | 86 +++++++++++++------ ecc/bls12-378/multiexp.go | 76 ++++++++-------- ecc/bls12-378/multiexp_affine.go | 86 +++++++++++++------ ecc/bls12-381/multiexp.go | 76 ++++++++-------- ecc/bls12-381/multiexp_affine.go | 86 +++++++++++++------ ecc/bls24-315/multiexp.go | 76 ++++++++-------- ecc/bls24-315/multiexp_affine.go | 86 +++++++++++++------ ecc/bls24-317/multiexp.go | 76 ++++++++-------- ecc/bls24-317/multiexp_affine.go | 86 +++++++++++++------ ecc/bn254/multiexp.go | 76 ++++++++-------- ecc/bn254/multiexp_affine.go | 86 +++++++++++++------ ecc/bw6-633/multiexp.go | 76 ++++++++-------- ecc/bw6-633/multiexp_affine.go | 86 +++++++++++++------ ecc/bw6-756/multiexp.go | 76 ++++++++-------- ecc/bw6-756/multiexp_affine.go | 86 +++++++++++++------ ecc/bw6-761/multiexp.go | 76 ++++++++-------- ecc/bw6-761/multiexp_affine.go | 86 +++++++++++++------ .../generator/ecc/template/multiexp.go.tmpl | 74 ++++++++-------- .../ecc/template/multiexp_affine.go.tmpl | 43 +++++++--- 20 files changed, 931 insertions(+), 644 deletions(-) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index b0c3c27e36..bc85684256 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -721,61 +721,61 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - abs := func(v int) int { - if v < 0 { - return -v - } - return v - } + // 
abs := func(v int) int { + // if v < 0 { + // return -v + // } + // return v + // } // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - var opsPerBucket [1 << 15]int // max value is 16 for c + // var opsPerBucket [1 << 15]int // max value is 16 for c // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - nz := 0 // non zero buckets count + // nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - bucketID := digit >> 1 - if digit&1 == 0 { - bucketID -= 1 - } - if opsPerBucket[bucketID] == 0 { - nz++ - } - opsPerBucket[bucketID]++ + // bucketID := digit >> 1 + // if digit &1 == 0 { + // bucketID-=1 + // } + // if opsPerBucket[bucketID] == 0 { + // nz++ + // } + // opsPerBucket[bucketID]++ } chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - chunkStats[chunkID].nonZeroBuckets = nz - - if nz == 0 { - return // no ops, only zeroes - } - - bound := 1 << (c - 1) - if chunkID == int(nbChunks-1) { - bound = 1 << (lastC(c) - 1) - } - mean := totalOps / nz - aad := 0 - averageOpsPerBucket := 0 - for b := 0; b < bound; b++ { - if opsPerBucket[b] == 0 { - continue - } - aad += abs(opsPerBucket[b] - mean) - averageOpsPerBucket += opsPerBucket[b] - } - chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - chunkStats[chunkID].deviation = aad / nz + // chunkStats[chunkID].nonZeroBuckets = nz + + // if nz == 0 { + // return // no ops, only zeroes + // } + + // bound := 1 << (c-1) + // if chunkID == int(nbChunks-1) { + // bound = 1 << (lastC(c)-1) + // } + // mean := totalOps / nz + // aad := 0 + // averageOpsPerBucket := 0 + // for b:=0; b < bound; b++ { + // if opsPerBucket[b] == 0 { + // continue + // } + // aad += abs(opsPerBucket[b] - mean) + // averageOpsPerBucket += opsPerBucket[b] + // } + // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + // chunkStats[chunkID].deviation = aad / nz } }, nbTasks) diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index e953a6f079..b2c167e9bc 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -23,11 +23,13 @@ import ( type batchOpG1Affine struct { bucketID uint16 - point G1Affine + // pointID uint32 + point G1Affine } func (o batchOpG1Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -46,6 +48,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -60,6 +63,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG1Affine batchSize := len(P) @@ -77,7 +81,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) 
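+ // bucket was empty: record occupancy and store the point as-is;
+ // no batch slot and no field inversion are needed for this op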
return } @@ -90,6 +95,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -103,12 +109,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -120,12 +127,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -152,13 +161,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -180,8 +190,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -201,6 +213,9 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP } } + // flush items in batch. + executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -208,9 +223,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -218,9 +230,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } @@ -337,11 +352,13 @@ type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 - point G2Affine + // pointID uint32 + point G2Affine } func (o batchOpG2Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -360,6 +377,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -374,6 +392,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG2Affine batchSize := len(P) @@ -391,7 +410,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -404,6 +424,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -417,12 +438,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -434,12 +456,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -466,13 +490,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -494,8 +519,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -515,6 +542,9 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP } } + // flush items in batch. 
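+ // (ordering note: executeAndReset applies the pending batch-affine
+ // additions to the affine buckets; flushQueue below then drains the
+ // conflicting ops into the separate extended-Jacobian buckets bucketsJE,
+ // and the two bucket arrays are merged only in the final reduction.)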
+ executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -522,9 +552,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -532,9 +559,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 5833c4e009..7164359d9f 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -721,61 +721,61 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - abs := func(v int) int { - if v < 0 { - return -v - } - return v - } + // abs := func(v int) int { + // if v < 0 { + // return -v + // } + // return v + // } // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - var opsPerBucket [1 << 15]int // max value is 16 for c + // var opsPerBucket [1 << 15]int // max value is 16 for c // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - nz := 0 // non zero buckets count + // nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - bucketID := digit >> 1 - if digit&1 == 0 { - bucketID -= 1 - } - if opsPerBucket[bucketID] == 0 { - nz++ - } - opsPerBucket[bucketID]++ + // bucketID := digit >> 1 + // if digit &1 == 0 { + // bucketID-=1 + // } + // if opsPerBucket[bucketID] == 0 { + // nz++ + // } + // opsPerBucket[bucketID]++ } chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - chunkStats[chunkID].nonZeroBuckets = nz - - if nz == 0 { - return // no ops, only zeroes - } - - bound := 1 << (c - 1) - if chunkID == int(nbChunks-1) { - bound = 1 << (lastC(c) - 1) - } - mean := totalOps / nz - aad := 0 - averageOpsPerBucket := 0 - for b := 0; b < bound; b++ { - if opsPerBucket[b] == 0 { - continue - } - aad += abs(opsPerBucket[b] - mean) - averageOpsPerBucket += opsPerBucket[b] - } - chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - chunkStats[chunkID].deviation = aad / nz + // chunkStats[chunkID].nonZeroBuckets = nz + + // if nz == 0 { + // return // no ops, only zeroes + // } + + // bound := 1 << (c-1) + // if chunkID == int(nbChunks-1) { + // bound = 1 << (lastC(c)-1) + // } + // mean := totalOps / nz + // aad := 0 + // averageOpsPerBucket := 0 + // for b:=0; b < bound; b++ { + // if opsPerBucket[b] == 0 { + // continue + // } + // aad += abs(opsPerBucket[b] - mean) + // averageOpsPerBucket += opsPerBucket[b] + // } + // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + // chunkStats[chunkID].deviation = aad / nz } }, nbTasks) diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index 533cb7304c..af73df4882 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -23,11 +23,13 @@ import ( type batchOpG1Affine struct { bucketID uint16 - point G1Affine + // pointID 
uint32 + point G1Affine } func (o batchOpG1Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -46,6 +48,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -60,6 +63,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG1Affine batchSize := len(P) @@ -77,7 +81,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -90,6 +95,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -103,12 +109,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -120,12 +127,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -152,13 +161,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -180,8 +190,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -201,6 +213,9 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP } } + // flush items in batch. + executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -208,9 +223,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -218,9 +230,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } @@ -337,11 +352,13 @@ type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 - point G2Affine + // pointID uint32 + point G2Affine } func (o batchOpG2Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -360,6 +377,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -374,6 +392,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG2Affine batchSize := len(P) @@ -391,7 +410,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -404,6 +424,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -417,12 +438,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -434,12 +456,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -466,13 +490,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -494,8 +519,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -515,6 +542,9 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP } } + // flush items in batch. 
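+ // (for reference, the reduction below computes
+ // total = 1*bucket[0] + 2*bucket[1] + ... + n*bucket[n-1]
+ // with a single running sum; the same idea over ints, with bucket and n
+ // as illustrative stand-ins:
+ //
+ //   total, runningSum := 0, 0
+ //   for k := n - 1; k >= 0; k-- {
+ //       runningSum += bucket[k] // runningSum = bucket[k] + ... + bucket[n-1]
+ //       total += runningSum     // so bucket[k] is counted (k+1) times overall
+ //   }
+ // )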
+ executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -522,9 +552,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -532,9 +559,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index 50b6f180d3..902a6245f9 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -721,61 +721,61 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - abs := func(v int) int { - if v < 0 { - return -v - } - return v - } + // abs := func(v int) int { + // if v < 0 { + // return -v + // } + // return v + // } // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - var opsPerBucket [1 << 15]int // max value is 16 for c + // var opsPerBucket [1 << 15]int // max value is 16 for c // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - nz := 0 // non zero buckets count + // nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - bucketID := digit >> 1 - if digit&1 == 0 { - bucketID -= 1 - } - if opsPerBucket[bucketID] == 0 { - nz++ - } - opsPerBucket[bucketID]++ + // bucketID := digit >> 1 + // if digit &1 == 0 { + // bucketID-=1 + // } + // if opsPerBucket[bucketID] == 0 { + // nz++ + // } + // opsPerBucket[bucketID]++ } chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - chunkStats[chunkID].nonZeroBuckets = nz - - if nz == 0 { - return // no ops, only zeroes - } - - bound := 1 << (c - 1) - if chunkID == int(nbChunks-1) { - bound = 1 << (lastC(c) - 1) - } - mean := totalOps / nz - aad := 0 - averageOpsPerBucket := 0 - for b := 0; b < bound; b++ { - if opsPerBucket[b] == 0 { - continue - } - aad += abs(opsPerBucket[b] - mean) - averageOpsPerBucket += opsPerBucket[b] - } - chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - chunkStats[chunkID].deviation = aad / nz + // chunkStats[chunkID].nonZeroBuckets = nz + + // if nz == 0 { + // return // no ops, only zeroes + // } + + // bound := 1 << (c-1) + // if chunkID == int(nbChunks-1) { + // bound = 1 << (lastC(c)-1) + // } + // mean := totalOps / nz + // aad := 0 + // averageOpsPerBucket := 0 + // for b:=0; b < bound; b++ { + // if opsPerBucket[b] == 0 { + // continue + // } + // aad += abs(opsPerBucket[b] - mean) + // averageOpsPerBucket += opsPerBucket[b] + // } + // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + // chunkStats[chunkID].deviation = aad / nz } }, nbTasks) diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index 61ebeeebb0..992efc3fa4 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -23,11 +23,13 @@ import ( type batchOpG1Affine struct { bucketID uint16 - point G1Affine + // pointID 
uint32 + point G1Affine } func (o batchOpG1Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -46,6 +48,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -60,6 +63,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG1Affine batchSize := len(P) @@ -77,7 +81,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -90,6 +95,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -103,12 +109,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -120,12 +127,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -152,13 +161,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -180,8 +190,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -201,6 +213,9 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP } } + // flush items in batch. + executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -208,9 +223,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -218,9 +230,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } @@ -337,11 +352,13 @@ type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 - point G2Affine + // pointID uint32 + point G2Affine } func (o batchOpG2Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -360,6 +377,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -374,6 +392,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG2Affine batchSize := len(P) @@ -391,7 +410,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -404,6 +424,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -417,12 +438,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -434,12 +456,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -466,13 +490,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -494,8 +519,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -515,6 +542,9 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP } } + // flush items in batch. 
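+ // (recap of the special cases in add/addFromQueue above: when the op and
+ // the bucket share an X coordinate, an equal Y means the op degenerates
+ // to a doubling of the bucket, and an opposite Y means the points cancel,
+ // so the bucket is cleared and its bucketSet bit reset; subtractions
+ // mirror the same two cases with the roles swapped.)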
+ executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -522,9 +552,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -532,9 +559,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 3c05caa715..16a77feb26 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -721,61 +721,61 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - abs := func(v int) int { - if v < 0 { - return -v - } - return v - } + // abs := func(v int) int { + // if v < 0 { + // return -v + // } + // return v + // } // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - var opsPerBucket [1 << 15]int // max value is 16 for c + // var opsPerBucket [1 << 15]int // max value is 16 for c // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - nz := 0 // non zero buckets count + // nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - bucketID := digit >> 1 - if digit&1 == 0 { - bucketID -= 1 - } - if opsPerBucket[bucketID] == 0 { - nz++ - } - opsPerBucket[bucketID]++ + // bucketID := digit >> 1 + // if digit &1 == 0 { + // bucketID-=1 + // } + // if opsPerBucket[bucketID] == 0 { + // nz++ + // } + // opsPerBucket[bucketID]++ } chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - chunkStats[chunkID].nonZeroBuckets = nz - - if nz == 0 { - return // no ops, only zeroes - } - - bound := 1 << (c - 1) - if chunkID == int(nbChunks-1) { - bound = 1 << (lastC(c) - 1) - } - mean := totalOps / nz - aad := 0 - averageOpsPerBucket := 0 - for b := 0; b < bound; b++ { - if opsPerBucket[b] == 0 { - continue - } - aad += abs(opsPerBucket[b] - mean) - averageOpsPerBucket += opsPerBucket[b] - } - chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - chunkStats[chunkID].deviation = aad / nz + // chunkStats[chunkID].nonZeroBuckets = nz + + // if nz == 0 { + // return // no ops, only zeroes + // } + + // bound := 1 << (c-1) + // if chunkID == int(nbChunks-1) { + // bound = 1 << (lastC(c)-1) + // } + // mean := totalOps / nz + // aad := 0 + // averageOpsPerBucket := 0 + // for b:=0; b < bound; b++ { + // if opsPerBucket[b] == 0 { + // continue + // } + // aad += abs(opsPerBucket[b] - mean) + // averageOpsPerBucket += opsPerBucket[b] + // } + // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + // chunkStats[chunkID].deviation = aad / nz } }, nbTasks) diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index 85af6357fd..ac3db048e6 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -23,11 +23,13 @@ import ( type batchOpG1Affine struct { bucketID uint16 - point G1Affine + // pointID 
uint32 + point G1Affine } func (o batchOpG1Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -46,6 +48,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -60,6 +63,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG1Affine batchSize := len(P) @@ -77,7 +81,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -90,6 +95,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -103,12 +109,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -120,12 +127,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -152,13 +161,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -180,8 +190,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -201,6 +213,9 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP } } + // flush items in batch. + executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -208,9 +223,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -218,9 +230,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } @@ -337,11 +352,13 @@ type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 - point G2Affine + // pointID uint32 + point G2Affine } func (o batchOpG2Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -360,6 +377,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -374,6 +392,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG2Affine batchSize := len(P) @@ -391,7 +410,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -404,6 +424,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -417,12 +438,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -434,12 +456,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -466,13 +490,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -494,8 +519,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -515,6 +542,9 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP } } + // flush items in batch. 
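+ // (why conflicting ops are queued: one batch shares a single field
+ // inversion across all its lambda denominators, computed from a snapshot
+ // of the buckets, so each bucket may appear at most once per batch; a
+ // second op on the same bucket within a batch would read a stale bucket
+ // value. At this checkpoint, flushQueue sends such ops straight into
+ // bucketsJE instead of retrying them in a later batch.)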
+ executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -522,9 +552,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -532,9 +559,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index 2f80a71f5c..9c8c78a3f6 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -721,61 +721,61 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - abs := func(v int) int { - if v < 0 { - return -v - } - return v - } + // abs := func(v int) int { + // if v < 0 { + // return -v + // } + // return v + // } // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - var opsPerBucket [1 << 15]int // max value is 16 for c + // var opsPerBucket [1 << 15]int // max value is 16 for c // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - nz := 0 // non zero buckets count + // nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - bucketID := digit >> 1 - if digit&1 == 0 { - bucketID -= 1 - } - if opsPerBucket[bucketID] == 0 { - nz++ - } - opsPerBucket[bucketID]++ + // bucketID := digit >> 1 + // if digit &1 == 0 { + // bucketID-=1 + // } + // if opsPerBucket[bucketID] == 0 { + // nz++ + // } + // opsPerBucket[bucketID]++ } chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - chunkStats[chunkID].nonZeroBuckets = nz - - if nz == 0 { - return // no ops, only zeroes - } - - bound := 1 << (c - 1) - if chunkID == int(nbChunks-1) { - bound = 1 << (lastC(c) - 1) - } - mean := totalOps / nz - aad := 0 - averageOpsPerBucket := 0 - for b := 0; b < bound; b++ { - if opsPerBucket[b] == 0 { - continue - } - aad += abs(opsPerBucket[b] - mean) - averageOpsPerBucket += opsPerBucket[b] - } - chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - chunkStats[chunkID].deviation = aad / nz + // chunkStats[chunkID].nonZeroBuckets = nz + + // if nz == 0 { + // return // no ops, only zeroes + // } + + // bound := 1 << (c-1) + // if chunkID == int(nbChunks-1) { + // bound = 1 << (lastC(c)-1) + // } + // mean := totalOps / nz + // aad := 0 + // averageOpsPerBucket := 0 + // for b:=0; b < bound; b++ { + // if opsPerBucket[b] == 0 { + // continue + // } + // aad += abs(opsPerBucket[b] - mean) + // averageOpsPerBucket += opsPerBucket[b] + // } + // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + // chunkStats[chunkID].deviation = aad / nz } }, nbTasks) diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index a6fcb5a2f8..4e241f79a5 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -23,11 +23,13 @@ import ( type batchOpG1Affine struct { bucketID uint16 - point G1Affine + // pointID 
uint32 + point G1Affine } func (o batchOpG1Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -46,6 +48,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -60,6 +63,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG1Affine batchSize := len(P) @@ -77,7 +81,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -90,6 +95,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -103,12 +109,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -120,12 +127,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -152,13 +161,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -180,8 +190,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -201,6 +213,9 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP } } + // flush items in batch. + executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -208,9 +223,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -218,9 +230,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } @@ -337,11 +352,13 @@ type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 - point G2Affine + // pointID uint32 + point G2Affine } func (o batchOpG2Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -360,6 +377,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -374,6 +392,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG2Affine batchSize := len(P) @@ -391,7 +410,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -404,6 +424,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -417,12 +438,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -434,12 +456,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -466,13 +490,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -494,8 +519,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -515,6 +542,9 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP } } + // flush items in batch. 
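+ // (sketch of what executeAndReset computes per batch slot, the textbook
+ // affine chord rule with the inversion in lambda amortized over the whole
+ // batch; this shows the math only, not the exact field-call sequence:
+ //
+ //   lambda = (y2 - y1) / (x2 - x1)  // tangent slope instead for P+P
+ //   x3 = lambda^2 - x1 - x2
+ //   y3 = lambda*(x1 - x3) - y1
+ // )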
+ executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -522,9 +552,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -532,9 +559,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index 0767ef87aa..b06acb685c 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -721,61 +721,61 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - abs := func(v int) int { - if v < 0 { - return -v - } - return v - } + // abs := func(v int) int { + // if v < 0 { + // return -v + // } + // return v + // } // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - var opsPerBucket [1 << 15]int // max value is 16 for c + // var opsPerBucket [1 << 15]int // max value is 16 for c // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - nz := 0 // non zero buckets count + // nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - bucketID := digit >> 1 - if digit&1 == 0 { - bucketID -= 1 - } - if opsPerBucket[bucketID] == 0 { - nz++ - } - opsPerBucket[bucketID]++ + // bucketID := digit >> 1 + // if digit &1 == 0 { + // bucketID-=1 + // } + // if opsPerBucket[bucketID] == 0 { + // nz++ + // } + // opsPerBucket[bucketID]++ } chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - chunkStats[chunkID].nonZeroBuckets = nz - - if nz == 0 { - return // no ops, only zeroes - } - - bound := 1 << (c - 1) - if chunkID == int(nbChunks-1) { - bound = 1 << (lastC(c) - 1) - } - mean := totalOps / nz - aad := 0 - averageOpsPerBucket := 0 - for b := 0; b < bound; b++ { - if opsPerBucket[b] == 0 { - continue - } - aad += abs(opsPerBucket[b] - mean) - averageOpsPerBucket += opsPerBucket[b] - } - chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - chunkStats[chunkID].deviation = aad / nz + // chunkStats[chunkID].nonZeroBuckets = nz + + // if nz == 0 { + // return // no ops, only zeroes + // } + + // bound := 1 << (c-1) + // if chunkID == int(nbChunks-1) { + // bound = 1 << (lastC(c)-1) + // } + // mean := totalOps / nz + // aad := 0 + // averageOpsPerBucket := 0 + // for b:=0; b < bound; b++ { + // if opsPerBucket[b] == 0 { + // continue + // } + // aad += abs(opsPerBucket[b] - mean) + // averageOpsPerBucket += opsPerBucket[b] + // } + // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + // chunkStats[chunkID].deviation = aad / nz } }, nbTasks) diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index 044f6834fb..a268d19c87 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -23,11 +23,13 @@ import ( type batchOpG1Affine struct { bucketID uint16 - point G1Affine + // pointID uint32 + point G1Affine } func (o 
batchOpG1Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -46,6 +48,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -60,6 +63,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG1Affine batchSize := len(P) @@ -77,7 +81,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -90,6 +95,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -103,12 +109,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -120,12 +127,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -152,13 +161,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -180,8 +190,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -201,6 +213,9 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP } } + // flush items in batch. + executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -208,9 +223,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -218,9 +230,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } @@ -337,11 +352,13 @@ type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 - point G2Affine + // pointID uint32 + point G2Affine } func (o batchOpG2Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -360,6 +377,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -374,6 +392,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG2Affine batchSize := len(P) @@ -391,7 +410,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -404,6 +424,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -417,12 +438,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -434,12 +456,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -466,13 +490,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -494,8 +519,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -515,6 +542,9 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP } } + // flush items in batch. 
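The reduction that closes each of these chunk processors uses the running-sum identity total = bucket[0] + 2*bucket[1] + ... + n*bucket[n-1], at two group additions per bucket. A minimal sketch of the identity over plain integers (reduce is a hypothetical stand-in, not the curve types used in this patch):

package main

import "fmt"

// reduce mirrors the runningSum/total loop of the chunk processors:
// scanning from the top bucket down, runningSum holds
// bucket[k] + bucket[k+1] + ... + bucket[n-1], so bucket[k] is folded
// into total exactly (k+1) times.
func reduce(bucket []int) int {
	runningSum, total := 0, 0
	for k := len(bucket) - 1; k >= 0; k-- {
		runningSum += bucket[k]
		total += runningSum
	}
	return total
}

func main() {
	fmt.Println(reduce([]int{3, 5, 7})) // 1*3 + 2*5 + 3*7 = 34
}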
+ executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -522,9 +552,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -532,9 +559,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index 6bc12d487a..1bc8b9e0ef 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -571,61 +571,61 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - abs := func(v int) int { - if v < 0 { - return -v - } - return v - } + // abs := func(v int) int { + // if v < 0 { + // return -v + // } + // return v + // } // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - var opsPerBucket [1 << 15]int // max value is 16 for c + // var opsPerBucket [1 << 15]int // max value is 16 for c // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - nz := 0 // non zero buckets count + // nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - bucketID := digit >> 1 - if digit&1 == 0 { - bucketID -= 1 - } - if opsPerBucket[bucketID] == 0 { - nz++ - } - opsPerBucket[bucketID]++ + // bucketID := digit >> 1 + // if digit &1 == 0 { + // bucketID-=1 + // } + // if opsPerBucket[bucketID] == 0 { + // nz++ + // } + // opsPerBucket[bucketID]++ } chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - chunkStats[chunkID].nonZeroBuckets = nz - - if nz == 0 { - return // no ops, only zeroes - } - - bound := 1 << (c - 1) - if chunkID == int(nbChunks-1) { - bound = 1 << (lastC(c) - 1) - } - mean := totalOps / nz - aad := 0 - averageOpsPerBucket := 0 - for b := 0; b < bound; b++ { - if opsPerBucket[b] == 0 { - continue - } - aad += abs(opsPerBucket[b] - mean) - averageOpsPerBucket += opsPerBucket[b] - } - chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - chunkStats[chunkID].deviation = aad / nz + // chunkStats[chunkID].nonZeroBuckets = nz + + // if nz == 0 { + // return // no ops, only zeroes + // } + + // bound := 1 << (c-1) + // if chunkID == int(nbChunks-1) { + // bound = 1 << (lastC(c)-1) + // } + // mean := totalOps / nz + // aad := 0 + // averageOpsPerBucket := 0 + // for b:=0; b < bound; b++ { + // if opsPerBucket[b] == 0 { + // continue + // } + // aad += abs(opsPerBucket[b] - mean) + // averageOpsPerBucket += opsPerBucket[b] + // } + // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + // chunkStats[chunkID].deviation = aad / nz } }, nbTasks) diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index 1de033659f..a10b6cbf76 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -22,11 +22,13 @@ import ( type batchOpG1Affine struct { bucketID uint16 - point G1Affine + // pointID uint32 + point 
G1Affine } func (o batchOpG1Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -45,6 +47,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -59,6 +62,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG1Affine batchSize := len(P) @@ -76,7 +80,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -89,6 +94,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -102,12 +108,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -119,12 +126,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -151,13 +160,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -179,8 +189,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -200,6 +212,9 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP } } + // flush items in batch. + executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -207,9 +222,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -217,9 +229,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } @@ -264,11 +279,13 @@ type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 - point G2Affine + // pointID uint32 + point G2Affine } func (o batchOpG2Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -287,6 +304,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -301,6 +319,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG2Affine batchSize := len(P) @@ -318,7 +337,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -331,6 +351,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -344,12 +365,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -361,12 +383,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -393,13 +417,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -421,8 +446,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -442,6 +469,9 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP } } + // flush items in batch. 
+ executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -449,9 +479,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -459,9 +486,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index 951ee4bd04..e695c8791f 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -571,61 +571,61 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - abs := func(v int) int { - if v < 0 { - return -v - } - return v - } + // abs := func(v int) int { + // if v < 0 { + // return -v + // } + // return v + // } // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - var opsPerBucket [1 << 15]int // max value is 16 for c + // var opsPerBucket [1 << 15]int // max value is 16 for c // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - nz := 0 // non zero buckets count + // nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - bucketID := digit >> 1 - if digit&1 == 0 { - bucketID -= 1 - } - if opsPerBucket[bucketID] == 0 { - nz++ - } - opsPerBucket[bucketID]++ + // bucketID := digit >> 1 + // if digit &1 == 0 { + // bucketID-=1 + // } + // if opsPerBucket[bucketID] == 0 { + // nz++ + // } + // opsPerBucket[bucketID]++ } chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - chunkStats[chunkID].nonZeroBuckets = nz - - if nz == 0 { - return // no ops, only zeroes - } - - bound := 1 << (c - 1) - if chunkID == int(nbChunks-1) { - bound = 1 << (lastC(c) - 1) - } - mean := totalOps / nz - aad := 0 - averageOpsPerBucket := 0 - for b := 0; b < bound; b++ { - if opsPerBucket[b] == 0 { - continue - } - aad += abs(opsPerBucket[b] - mean) - averageOpsPerBucket += opsPerBucket[b] - } - chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - chunkStats[chunkID].deviation = aad / nz + // chunkStats[chunkID].nonZeroBuckets = nz + + // if nz == 0 { + // return // no ops, only zeroes + // } + + // bound := 1 << (c-1) + // if chunkID == int(nbChunks-1) { + // bound = 1 << (lastC(c)-1) + // } + // mean := totalOps / nz + // aad := 0 + // averageOpsPerBucket := 0 + // for b:=0; b < bound; b++ { + // if opsPerBucket[b] == 0 { + // continue + // } + // aad += abs(opsPerBucket[b] - mean) + // averageOpsPerBucket += opsPerBucket[b] + // } + // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + // chunkStats[chunkID].deviation = aad / nz } }, nbTasks) diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index 5b99051b82..00e9053fd1 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -22,11 +22,13 @@ import ( type batchOpG1Affine struct { bucketID uint16 - point G1Affine + // pointID uint32 + point 
G1Affine } func (o batchOpG1Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -45,6 +47,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -59,6 +62,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG1Affine batchSize := len(P) @@ -76,7 +80,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -89,6 +94,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -102,12 +108,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -119,12 +126,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -151,13 +160,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -179,8 +189,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -200,6 +212,9 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP } } + // flush items in batch. + executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -207,9 +222,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -217,9 +229,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } @@ -264,11 +279,13 @@ type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 - point G2Affine + // pointID uint32 + point G2Affine } func (o batchOpG2Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -287,6 +304,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -301,6 +319,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG2Affine batchSize := len(P) @@ -318,7 +337,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -331,6 +351,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -344,12 +365,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -361,12 +383,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -393,13 +417,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -421,8 +446,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -442,6 +469,9 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP } } + // flush items in batch. 
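The bucketSet updates above replace the previous BK.IsInfinity() test: once a plain affine bucket can be cancelled back to infinity by P + (-P), it is cheaper and unambiguous to track occupancy in a dedicated bit per bucket than to compare field elements against an infinity encoding. A sketch under assumed placeholder types (point and window are illustrative only):

package sketch

// point is a stand-in for an affine curve point.
type point struct{ x, y uint64 }

// window pairs each affine bucket with an occupancy bit, the role played
// by bucketSet above: untouched buckets and buckets cancelled by P + (-P)
// both read as free with a single bool load.
type window struct {
	buckets   [512]point
	bucketSet [512]bool
}

// firstHit stores p if the bucket is free and reports whether it did;
// a false return means a real addition (or doubling) is needed.
func (w *window) firstHit(i int, p point) bool {
	if w.bucketSet[i] {
		return false
	}
	w.buckets[i] = p
	w.bucketSet[i] = true
	return true
}

// cancel frees a bucket whose sum became the point at infinity.
func (w *window) cancel(i int) {
	w.bucketSet[i] = false
}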
+ executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -449,9 +479,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -459,9 +486,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index fe9a1970a4..b89e06e566 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -571,61 +571,61 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - abs := func(v int) int { - if v < 0 { - return -v - } - return v - } + // abs := func(v int) int { + // if v < 0 { + // return -v + // } + // return v + // } // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - var opsPerBucket [1 << 15]int // max value is 16 for c + // var opsPerBucket [1 << 15]int // max value is 16 for c // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - nz := 0 // non zero buckets count + // nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - bucketID := digit >> 1 - if digit&1 == 0 { - bucketID -= 1 - } - if opsPerBucket[bucketID] == 0 { - nz++ - } - opsPerBucket[bucketID]++ + // bucketID := digit >> 1 + // if digit &1 == 0 { + // bucketID-=1 + // } + // if opsPerBucket[bucketID] == 0 { + // nz++ + // } + // opsPerBucket[bucketID]++ } chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - chunkStats[chunkID].nonZeroBuckets = nz - - if nz == 0 { - return // no ops, only zeroes - } - - bound := 1 << (c - 1) - if chunkID == int(nbChunks-1) { - bound = 1 << (lastC(c) - 1) - } - mean := totalOps / nz - aad := 0 - averageOpsPerBucket := 0 - for b := 0; b < bound; b++ { - if opsPerBucket[b] == 0 { - continue - } - aad += abs(opsPerBucket[b] - mean) - averageOpsPerBucket += opsPerBucket[b] - } - chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - chunkStats[chunkID].deviation = aad / nz + // chunkStats[chunkID].nonZeroBuckets = nz + + // if nz == 0 { + // return // no ops, only zeroes + // } + + // bound := 1 << (c-1) + // if chunkID == int(nbChunks-1) { + // bound = 1 << (lastC(c)-1) + // } + // mean := totalOps / nz + // aad := 0 + // averageOpsPerBucket := 0 + // for b:=0; b < bound; b++ { + // if opsPerBucket[b] == 0 { + // continue + // } + // aad += abs(opsPerBucket[b] - mean) + // averageOpsPerBucket += opsPerBucket[b] + // } + // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + // chunkStats[chunkID].deviation = aad / nz } }, nbTasks) diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index 885062c223..13f133daa5 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -22,11 +22,13 @@ import ( type batchOpG1Affine struct { bucketID uint16 - point G1Affine + // pointID uint32 + point 
G1Affine } func (o batchOpG1Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -45,6 +47,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -59,6 +62,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG1Affine batchSize := len(P) @@ -76,7 +80,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -89,6 +94,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -102,12 +108,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -119,12 +126,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -151,13 +160,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -179,8 +189,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -200,6 +212,9 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP } } + // flush items in batch. + executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -207,9 +222,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -217,9 +229,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } @@ -264,11 +279,13 @@ type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 - point G2Affine + // pointID uint32 + point G2Affine } func (o batchOpG2Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -287,6 +304,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -301,6 +319,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG2Affine batchSize := len(P) @@ -318,7 +337,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -331,6 +351,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -344,12 +365,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -361,12 +383,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -393,13 +417,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -421,8 +446,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -442,6 +469,9 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP } } + // flush items in batch. 
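The deleted stats code above recovers a bucket index from a digit as bucketID := digit >> 1, minus one when the digit is even. One packing convention consistent with that arithmetic is sketched below; which parity encodes subtraction is an assumption here, and pack/unpack are hypothetical helpers, not functions of this library:

package sketch

// pack stores a window value v in [1, 2^(c-1)] as 2v for an addition and
// 2v-1 for a subtraction; 0 is reserved for "no contribution".
func pack(v int, sub bool) uint16 {
	if v == 0 {
		return 0
	}
	if sub {
		return uint16(2*v - 1)
	}
	return uint16(2 * v)
}

// unpack inverts pack using the same index arithmetic as the stats loop:
// both 2v and 2v-1 map to bucket v-1, and the low bit keeps the sign.
func unpack(digit uint16) (bucketID uint16, sub bool) {
	if digit == 0 {
		return // a zero digit touches no bucket
	}
	sub = digit&1 == 1
	bucketID = digit >> 1
	if digit&1 == 0 {
		bucketID--
	}
	return
}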
+ executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -449,9 +479,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -459,9 +486,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index f650f65dbc..b9563c9a61 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -164,62 +164,62 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - abs := func(v int) int { - if v < 0 { - return -v - } - return v - } + // abs := func(v int) int { + // if v < 0 { + // return -v + // } + // return v + // } // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - var opsPerBucket [1 << 15]int // max value is 16 for c + // var opsPerBucket [1 << 15]int // max value is 16 for c // digits for the chunk chunkDigits := digits[chunkID*len(scalars):(chunkID+1)*len(scalars)] totalOps := 0 - nz := 0 // non zero buckets count + // nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - bucketID := digit >> 1 - if digit &1 == 0 { - bucketID-=1 - } - if opsPerBucket[bucketID] == 0 { - nz++ - } - opsPerBucket[bucketID]++ + // bucketID := digit >> 1 + // if digit &1 == 0 { + // bucketID-=1 + // } + // if opsPerBucket[bucketID] == 0 { + // nz++ + // } + // opsPerBucket[bucketID]++ } chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - chunkStats[chunkID].nonZeroBuckets = nz + // chunkStats[chunkID].nonZeroBuckets = nz - if nz == 0 { - return // no ops, only zeroes - } - - bound := 1 << (c-1) - if chunkID == int(nbChunks-1) { - bound = 1 << (lastC(c)-1) - } - mean := totalOps / nz - aad := 0 - averageOpsPerBucket := 0 - for b:=0; b < bound; b++ { - if opsPerBucket[b] == 0 { - continue - } - aad += abs(opsPerBucket[b] - mean) - averageOpsPerBucket += opsPerBucket[b] - } - chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - chunkStats[chunkID].deviation = aad / nz + // if nz == 0 { + // return // no ops, only zeroes + // } + + // bound := 1 << (c-1) + // if chunkID == int(nbChunks-1) { + // bound = 1 << (lastC(c)-1) + // } + // mean := totalOps / nz + // aad := 0 + // averageOpsPerBucket := 0 + // for b:=0; b < bound; b++ { + // if opsPerBucket[b] == 0 { + // continue + // } + // aad += abs(opsPerBucket[b] - mean) + // averageOpsPerBucket += opsPerBucket[b] + // } + // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + // chunkStats[chunkID].deviation = aad / nz } }, nbTasks) diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index 1763f28292..6463bc1ede 100644 --- 
a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -28,11 +28,13 @@ import ( type batchOp{{ $.TAffine }} struct { bucketID uint16 + // pointID uint32 point {{ $.TAffine }} } func (o batchOp{{ $.TAffine }}) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunk{{ $.UPointName }}BatchAffine process a chunk of the scalars during the msm @@ -51,6 +53,7 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -65,6 +68,7 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOp{{ $.TAffine}} batchSize := len(P) @@ -83,7 +87,8 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -96,6 +101,7 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -109,12 +115,13 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -125,13 +132,15 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B if isAdd { BK.Add(BK, BK) } else { - BK.setInfinity() + BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -158,13 +167,14 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B processQueue := func () { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -187,8 +197,10 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -208,6 +220,10 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B } } + + // flush items in batch. + executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -215,8 +231,6 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B // executeAndReset() // } - // flush items in batch. - executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -225,9 +239,12 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } From b096408205a69b35ae735136b2280cf7582c55ba Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 16 Nov 2022 09:37:44 -0600 Subject: [PATCH 30/43] checkpoint --- ecc/bls12-377/multiexp.go | 335 +++++++----------- ecc/bls12-377/multiexp_test.go | 148 ++++---- ecc/bls12-378/multiexp.go | 335 +++++++----------- ecc/bls12-378/multiexp_test.go | 148 ++++---- ecc/bls12-381/multiexp.go | 335 +++++++----------- ecc/bls12-381/multiexp_test.go | 148 ++++---- ecc/bls24-315/multiexp.go | 335 +++++++----------- ecc/bls24-315/multiexp_test.go | 148 ++++---- ecc/bls24-317/multiexp.go | 335 +++++++----------- ecc/bls24-317/multiexp_test.go | 148 ++++---- ecc/bn254/multiexp.go | 335 +++++++----------- ecc/bn254/multiexp_test.go | 148 ++++---- ecc/bw6-633/multiexp.go | 137 +++---- ecc/bw6-633/multiexp_test.go | 148 ++++---- ecc/bw6-756/multiexp.go | 137 +++---- ecc/bw6-756/multiexp_test.go | 148 ++++---- ecc/bw6-761/multiexp.go | 137 +++---- ecc/bw6-761/multiexp_test.go | 148 ++++---- .../generator/ecc/template/multiexp.go.tmpl | 110 ++---- .../ecc/template/tests/multiexp.go.tmpl | 70 ++-- 20 files changed, 1657 insertions(+), 2276 deletions(-) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index bc85684256..b87523e4e0 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -18,7 +18,6 @@ package bls12377 import ( "errors" - "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -141,99 +140,79 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - // const batchSize = 80 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC10] - // } - // fmt.Printf("affine \n") + const batchSize = 80 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC10] + } return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - // const batchSize = 150 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. 
- // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC11] - // } - // fmt.Printf("affine \n") + const batchSize = 150 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC11] + } return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - // const batchSize = 200 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC12] - // } - // fmt.Printf("affine \n") + const batchSize = 200 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC12] + } return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - // const batchSize = 350 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC13] - // } - // fmt.Printf("affine \n") + const batchSize = 350 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC13] + } return processChunkG1BatchAffine[bucketg1JacExtendedC13, bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - // const batchSize = 400 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC14] - // } - // fmt.Printf("affine \n") + const batchSize = 400 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC14] + } return processChunkG1BatchAffine[bucketg1JacExtendedC14, bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - // const batchSize = 500 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. 
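The replacement logic in all of these cases follows one rule: each window width c carries a batch size (80 for c=10 up to 640 for c=16), and a chunk that fills fewer buckets than one batch holds can never assemble a conflict-free batch, so it falls back to the extended-Jacobian processor. A sketch of the rule with illustrative names (useBatchAffine, batchSizeForC):

package sketch

// batchSizeForC copies the per-window batch sizes appearing in the
// dispatch above; windows below c=10 never take the batch affine path.
var batchSizeForC = map[uint64]int{
	10: 80, 11: 150, 12: 200, 13: 350, 14: 400, 15: 500, 16: 640,
}

// useBatchAffine mirrors the stat.nbBucketFilled < batchSize fallback:
// batch affine pays off only when a batch can be filled with updates to
// distinct buckets.
func useBatchAffine(c uint64, nbBucketFilled int) bool {
	bs, ok := batchSizeForC[c]
	return ok && nbBucketFilled >= bs
}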
- // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC15] - // } - // fmt.Printf("affine \n") + const batchSize = 500 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC15] + } return processChunkG1BatchAffine[bucketg1JacExtendedC15, bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") @@ -247,8 +226,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan g1JacExtended, nbChunks) @@ -258,10 +236,8 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - // fmt.Printf("\n") - // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) + // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -419,99 +395,79 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - // const batchSize = 80 - // status: we are losing in perf here in the nominal case. 
- // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC10] - // } - // fmt.Printf("affine \n") + const batchSize = 80 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC10] + } return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - // const batchSize = 150 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC11] - // } - // fmt.Printf("affine \n") + const batchSize = 150 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC11] + } return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - // const batchSize = 200 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC12] - // } - // fmt.Printf("affine \n") + const batchSize = 200 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC12] + } return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - // const batchSize = 350 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC13] - // } - // fmt.Printf("affine \n") + const batchSize = 350 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC13] + } return processChunkG2BatchAffine[bucketg2JacExtendedC13, bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - // const batchSize = 400 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. 
- // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC14] - // } - // fmt.Printf("affine \n") + const batchSize = 400 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC14] + } return processChunkG2BatchAffine[bucketg2JacExtendedC14, bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - // const batchSize = 500 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC15] - // } - // fmt.Printf("affine \n") + const batchSize = 500 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC15] + } return processChunkG2BatchAffine[bucketg2JacExtendedC15, bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC16] + } return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: // panic("will not happen c != previous values is not generated by templates") @@ -525,8 +481,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan g2JacExtended, nbChunks) @@ -536,10 +491,8 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. 
 	n := len(points)
-	// fmt.Printf("\n")
-	// fmt.Println("n", n)
 	for j := int(nbChunks - 1); j >= 0; j-- {
-		// fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String())
+		// fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled)
 		processChunk := getChunkProcessorG2(c, chunkStats[j])
 		if j == int(nbChunks-1) {
 			processChunk = getChunkProcessorG2(lastC(c), chunkStats[j])
@@ -618,21 +571,18 @@ func lastC(c uint64) uint64 {
 type chunkStat struct {
 	// relative weight of work compared to other chunks. 100.0 -> nominal weight.
-	weight int
-
-	// average absolute deviation. this is meant to give a sense of statistical
-	// dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets)
-	deviation int
+	weight float32

-	// count the number of buckets that are non zeroes for this chunk
-	nonZeroBuckets int
+	// // average absolute deviation. this is meant to give a sense of statistical
+	// // dispersion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets)
+	// deviation int

-	// average ops per non-zero buckets
-	averageOpsPerBucket int
-}
+	// percentage of buckets filled in the window
+	ppBucketFilled float32
+	nbBucketFilled int

-func (c *chunkStat) String() string {
-	return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket)
+	// // average ops per non-zero buckets
+	// averageOpsPerBucket int
 }

 // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits
@@ -721,74 +671,49 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks
 	}, nbTasks)

-	// abs := func(v int) int {
-	// 	if v < 0 {
-	// 		return -v
-	// 	}
-	// 	return v
-	// }
-
 	// aggregate chunk stats
 	chunkStats := make([]chunkStat, nbChunks)
 	parallel.Execute(len(chunkStats), func(start, end int) {
 		// for each chunk compute the statistics
 		for chunkID := start; chunkID < end; chunkID++ {
-			// var opsPerBucket [1 << 15]int // max value is 16 for c
+			// indicates if a bucket is hit.
+ var b bitSetC16 + // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - // nz := 0 // non zero buckets count + nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - // bucketID := digit >> 1 - // if digit &1 == 0 { - // bucketID-=1 - // } - // if opsPerBucket[bucketID] == 0 { - // nz++ - // } - // opsPerBucket[bucketID]++ + bucketID := digit >> 1 + if digit&1 == 0 { + bucketID -= 1 + } + if !b[bucketID] { + nz++ + b[bucketID] = true + } } - chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - // chunkStats[chunkID].nonZeroBuckets = nz - - // if nz == 0 { - // return // no ops, only zeroes - // } - - // bound := 1 << (c-1) - // if chunkID == int(nbChunks-1) { - // bound = 1 << (lastC(c)-1) - // } - // mean := totalOps / nz - // aad := 0 - // averageOpsPerBucket := 0 - // for b:=0; b < bound; b++ { - // if opsPerBucket[b] == 0 { - // continue - // } - // aad += abs(opsPerBucket[b] - mean) - // averageOpsPerBucket += opsPerBucket[b] - // } - // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - // chunkStats[chunkID].deviation = aad / nz + chunkStats[chunkID].weight = float32(totalOps) // count number of ops for now, we will compute the weight after + chunkStats[chunkID].ppBucketFilled = (float32(nz) * 100.0) / float32(int(1<<(c-1))) + chunkStats[chunkID].nbBucketFilled = nz } }, nbTasks) - totalOps := 0 + totalOps := float32(0.0) for _, stat := range chunkStats { totalOps += stat.weight } - target := totalOps / int(nbChunks) - if target != 0 { + target := totalOps / float32(nbChunks) + if target != 0.0 { // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = (chunkStats[i].weight * 100) / target + chunkStats[i].weight = (chunkStats[i].weight * 100.0) / target } } diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 38d85e79ea..8a036b0f54 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -222,38 +222,38 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
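The statistics pass added to partitionScalars in the hunks above is easier to read outside its parallel.Execute wrapper. The sketch below restates it sequentially: decode each non-zero digit into its bucket (the low bit of a digit distinguishes an addition from a subtraction, so both map into the same bucket), count operations and distinct buckets, then rescale weights so that 100.0 means an average chunk. The slice-of-slices input and the fixed c are simplifications for illustration, not the generated code:

package main

import "fmt"

const c = 16 // widest generated window; 2^(c-1) buckets per chunk

type chunkStat struct {
	weight         float32 // 100.0 -> nominal share of the total work
	ppBucketFilled float32 // percentage of buckets hit in this window
	nbBucketFilled int
}

func computeStats(digitsPerChunk [][]uint16) []chunkStat {
	stats := make([]chunkStat, len(digitsPerChunk))
	var totalOps float32
	for chunkID, chunkDigits := range digitsPerChunk {
		var hit [1 << (c - 1)]bool // same role as the bitSetC16 above
		ops, nz := 0, 0
		for _, digit := range chunkDigits {
			if digit == 0 {
				continue // zero digit: this scalar skips the chunk entirely
			}
			ops++
			// mirror the patch's decoding of a non-zero digit into its
			// bucket index; even digits are shifted down by one slot.
			bucketID := digit >> 1
			if digit&1 == 0 {
				bucketID--
			}
			if !hit[bucketID] {
				nz++
				hit[bucketID] = true
			}
		}
		stats[chunkID].weight = float32(ops)
		stats[chunkID].ppBucketFilled = float32(nz) * 100.0 / float32(1<<(c-1))
		stats[chunkID].nbBucketFilled = nz
		totalOps += float32(ops)
	}
	// second pass: rescale so that an average chunk weighs exactly 100.0
	if target := totalOps / float32(len(stats)); target != 0 {
		for i := range stats {
			stats[i].weight = stats[i].weight * 100.0 / target
		}
	}
	return stats
}

func main() {
	fmt.Printf("%+v\n", computeStats([][]uint16{{0, 3, 3, 8}, {5, 0, 0, 0}}))
}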
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -263,19 +263,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } @@ -539,38 +539,38 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -580,19 +580,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 7164359d9f..702015ccb8 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -18,7 +18,6 @@ package bls12378 import ( "errors" - "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-378/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -141,99 +140,79 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - // const batchSize = 80 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. 
- // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC10] - // } - // fmt.Printf("affine \n") + const batchSize = 80 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC10] + } return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - // const batchSize = 150 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC11] - // } - // fmt.Printf("affine \n") + const batchSize = 150 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC11] + } return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - // const batchSize = 200 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC12] - // } - // fmt.Printf("affine \n") + const batchSize = 200 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC12] + } return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - // const batchSize = 350 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC13] - // } - // fmt.Printf("affine \n") + const batchSize = 350 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC13] + } return processChunkG1BatchAffine[bucketg1JacExtendedC13, bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - // const batchSize = 400 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. 
- // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC14] - // } - // fmt.Printf("affine \n") + const batchSize = 400 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC14] + } return processChunkG1BatchAffine[bucketg1JacExtendedC14, bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - // const batchSize = 500 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC15] - // } - // fmt.Printf("affine \n") + const batchSize = 500 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC15] + } return processChunkG1BatchAffine[bucketg1JacExtendedC15, bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") @@ -247,8 +226,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan g1JacExtended, nbChunks) @@ -258,10 +236,8 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. 
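For context, the surrounding _innerMsmG1 body (unchanged here apart from comments) is a plain fan-out/fan-in: one goroutine per chunk, each reporting its partial result on its own channel, reduced in window order by the caller. A minimal skeleton of that pattern, with an integer payload standing in for g1JacExtended and placeholder work instead of bucket accumulation:

package main

import "fmt"

// processChunk is a stand-in for the per-chunk bucket accumulation.
func processChunk(chunkID int, chRes chan<- int) {
	chRes <- chunkID * chunkID // placeholder work
}

func main() {
	const nbChunks = 8
	chChunks := make([]chan int, nbChunks)
	for i := range chChunks {
		chChunks[i] = make(chan int, 1) // buffered: the sender never blocks
	}
	// fan out, highest window first, as in the MSM inner loop
	for j := nbChunks - 1; j >= 0; j-- {
		go processChunk(j, chChunks[j])
	}
	// fan in: consume the results in deterministic chunk order
	total := 0
	for j := nbChunks - 1; j >= 0; j-- {
		total += <-chChunks[j]
	}
	fmt.Println(total)
}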
n := len(points) - // fmt.Printf("\n") - // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) + // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -419,99 +395,79 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - // const batchSize = 80 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC10] - // } - // fmt.Printf("affine \n") + const batchSize = 80 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC10] + } return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - // const batchSize = 150 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC11] - // } - // fmt.Printf("affine \n") + const batchSize = 150 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC11] + } return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - // const batchSize = 200 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC12] - // } - // fmt.Printf("affine \n") + const batchSize = 200 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. 
+ return processChunkG2Jacobian[bucketg2JacExtendedC12] + } return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - // const batchSize = 350 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC13] - // } - // fmt.Printf("affine \n") + const batchSize = 350 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC13] + } return processChunkG2BatchAffine[bucketg2JacExtendedC13, bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - // const batchSize = 400 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC14] - // } - // fmt.Printf("affine \n") + const batchSize = 400 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC14] + } return processChunkG2BatchAffine[bucketg2JacExtendedC14, bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - // const batchSize = 500 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC15] - // } - // fmt.Printf("affine \n") + const batchSize = 500 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC15] + } return processChunkG2BatchAffine[bucketg2JacExtendedC15, bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. 
+			return processChunkG2Jacobian[bucketg2JacExtendedC16]
+		}
 		return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16]
 	default:
 		// panic("will not happen c != previous values is not generated by templates")
@@ -525,8 +481,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt
 	// for each chunk, spawn one go routine that'll loop through all the scalars in the
 	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+	// note that buckets is an array allocated on the stack and this is critical for performance

 	// each go routine sends its result in chChunks[i] channel
 	chChunks := make([]chan g2JacExtended, nbChunks)
@@ -536,10 +491,8 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt
 	// the last chunk may be processed with a different method than the rest, as it could be smaller.
 	n := len(points)
-	// fmt.Printf("\n")
-	// fmt.Println("n", n)
 	for j := int(nbChunks - 1); j >= 0; j-- {
-		// fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String())
+		// fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled)
 		processChunk := getChunkProcessorG2(c, chunkStats[j])
 		if j == int(nbChunks-1) {
 			processChunk = getChunkProcessorG2(lastC(c), chunkStats[j])
@@ -618,21 +571,18 @@ func lastC(c uint64) uint64 {
 type chunkStat struct {
 	// relative weight of work compared to other chunks. 100.0 -> nominal weight.
-	weight int
-
-	// average absolute deviation. this is meant to give a sense of statistical
-	// dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets)
-	deviation int
+	weight float32

-	// count the number of buckets that are non zeroes for this chunk
-	nonZeroBuckets int
+	// // average absolute deviation. this is meant to give a sense of statistical
+	// // dispersion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets)
+	// deviation int

-	// average ops per non-zero buckets
-	averageOpsPerBucket int
-}
+	// percentage of buckets filled in the window
+	ppBucketFilled float32
+	nbBucketFilled int

-func (c *chunkStat) String() string {
-	return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket)
+	// // average ops per non-zero buckets
+	// averageOpsPerBucket int
 }

 // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits
@@ -721,74 +671,49 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks
 	}, nbTasks)

-	// abs := func(v int) int {
-	// 	if v < 0 {
-	// 		return -v
-	// 	}
-	// 	return v
-	// }
-
 	// aggregate chunk stats
 	chunkStats := make([]chunkStat, nbChunks)
 	parallel.Execute(len(chunkStats), func(start, end int) {
 		// for each chunk compute the statistics
 		for chunkID := start; chunkID < end; chunkID++ {
-			// var opsPerBucket [1 << 15]int // max value is 16 for c
+			// indicates if a bucket is hit.
+ var b bitSetC16 + // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - // nz := 0 // non zero buckets count + nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - // bucketID := digit >> 1 - // if digit &1 == 0 { - // bucketID-=1 - // } - // if opsPerBucket[bucketID] == 0 { - // nz++ - // } - // opsPerBucket[bucketID]++ + bucketID := digit >> 1 + if digit&1 == 0 { + bucketID -= 1 + } + if !b[bucketID] { + nz++ + b[bucketID] = true + } } - chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - // chunkStats[chunkID].nonZeroBuckets = nz - - // if nz == 0 { - // return // no ops, only zeroes - // } - - // bound := 1 << (c-1) - // if chunkID == int(nbChunks-1) { - // bound = 1 << (lastC(c)-1) - // } - // mean := totalOps / nz - // aad := 0 - // averageOpsPerBucket := 0 - // for b:=0; b < bound; b++ { - // if opsPerBucket[b] == 0 { - // continue - // } - // aad += abs(opsPerBucket[b] - mean) - // averageOpsPerBucket += opsPerBucket[b] - // } - // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - // chunkStats[chunkID].deviation = aad / nz + chunkStats[chunkID].weight = float32(totalOps) // count number of ops for now, we will compute the weight after + chunkStats[chunkID].ppBucketFilled = (float32(nz) * 100.0) / float32(int(1<<(c-1))) + chunkStats[chunkID].nbBucketFilled = nz } }, nbTasks) - totalOps := 0 + totalOps := float32(0.0) for _, stat := range chunkStats { totalOps += stat.weight } - target := totalOps / int(nbChunks) - if target != 0 { + target := totalOps / float32(nbChunks) + if target != 0.0 { // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = (chunkStats[i].weight * 100) / target + chunkStats[i].weight = (chunkStats[i].weight * 100.0) / target } } diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index 846eab44a1..1e9ff1e4de 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -222,38 +222,38 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -263,19 +263,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } @@ -539,38 +539,38 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
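The comment above, together with the removed loop that follows it, describes the second of the two adversarial benchmark inputs this change disables: "small values" concentrates work in the first window, while "redundancy" maximizes bucket conflicts within a batch. A self-contained reconstruction of both generators, lifted from the commented-out code; it assumes it sits next to this test file (package bls12378) so it can reuse the existing fillBenchScalars helper:

package bls12378

import "github.com/consensys/gnark-crypto/ecc/bls12-378/fr"

// benchScalarDistributions rebuilds the two adversarial inputs from the
// disabled benchmarks: every 5th scalar set to 1 (small values), and blocks
// of 100 identical scalars (redundancy).
func benchScalarDistributions(n int) (small, redundant []fr.Element) {
	base := make([]fr.Element, n)
	fillBenchScalars(base)

	small = make([]fr.Element, n)
	redundant = make([]fr.Element, n)
	copy(small, base)
	copy(redundant, base)

	// small values: the first chunk gets most of the work and should be
	// split across several goroutines.
	for i := 0; i < len(small); i++ {
		if i%5 == 0 {
			small[i].SetZero()
			small[i][0] = 1
		}
	}

	// redundancy: within each window many scalars hit the same bucket,
	// forcing the batch-affine queue to flush in small batches.
	for i := 0; i < len(redundant); i += 100 {
		for j := i + 1; j < i+100 && j < len(redundant); j++ {
			redundant[j] = redundant[i]
		}
	}
	return small, redundant
}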
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -580,19 +580,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index 902a6245f9..91e471a850 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -18,7 +18,6 @@ package bls12381 import ( "errors" - "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-381/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -141,99 +140,79 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - // const batchSize = 80 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. 
- // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC10] - // } - // fmt.Printf("affine \n") + const batchSize = 80 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC10] + } return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - // const batchSize = 150 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC11] - // } - // fmt.Printf("affine \n") + const batchSize = 150 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC11] + } return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - // const batchSize = 200 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC12] - // } - // fmt.Printf("affine \n") + const batchSize = 200 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC12] + } return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - // const batchSize = 350 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC13] - // } - // fmt.Printf("affine \n") + const batchSize = 350 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC13] + } return processChunkG1BatchAffine[bucketg1JacExtendedC13, bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - // const batchSize = 400 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. 
- // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC14] - // } - // fmt.Printf("affine \n") + const batchSize = 400 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC14] + } return processChunkG1BatchAffine[bucketg1JacExtendedC14, bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - // const batchSize = 500 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC15] - // } - // fmt.Printf("affine \n") + const batchSize = 500 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC15] + } return processChunkG1BatchAffine[bucketg1JacExtendedC15, bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") @@ -247,8 +226,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan g1JacExtended, nbChunks) @@ -258,10 +236,8 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. 
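The reworded comment above is worth unpacking: the bucket storage must have a compile-time length to stay on the goroutine stack, which is exactly why a processor is instantiated per window size (bucketg1JacExtendedC4 through C16) rather than sized at runtime. A toy illustration of the difference; the names and the counting payload are made up, only the array-versus-slice point carries over:

package main

import "fmt"

// sumBucketsC4 mimics one generated chunk processor: the bucket storage is a
// fixed-size array, so it can be stack-allocated, whereas a make([]int, n)
// with a runtime-dependent length would be heap-allocated instead.
func sumBucketsC4(digits []uint16) int {
	var buckets [1 << 3]int // 2^(c-1) buckets for a toy c = 4
	for _, d := range digits {
		if d == 0 {
			continue
		}
		buckets[int(d)%len(buckets)]++
	}
	total := 0
	for _, b := range buckets {
		total += b
	}
	return total
}

func main() {
	fmt.Println(sumBucketsC4([]uint16{0, 1, 2, 9, 9}))
}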
n := len(points) - // fmt.Printf("\n") - // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) + // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -419,99 +395,79 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - // const batchSize = 80 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC10] - // } - // fmt.Printf("affine \n") + const batchSize = 80 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC10] + } return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - // const batchSize = 150 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC11] - // } - // fmt.Printf("affine \n") + const batchSize = 150 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC11] + } return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - // const batchSize = 200 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC12] - // } - // fmt.Printf("affine \n") + const batchSize = 200 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. 
+ return processChunkG2Jacobian[bucketg2JacExtendedC12] + } return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - // const batchSize = 350 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC13] - // } - // fmt.Printf("affine \n") + const batchSize = 350 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC13] + } return processChunkG2BatchAffine[bucketg2JacExtendedC13, bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - // const batchSize = 400 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC14] - // } - // fmt.Printf("affine \n") + const batchSize = 400 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC14] + } return processChunkG2BatchAffine[bucketg2JacExtendedC14, bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - // const batchSize = 500 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC15] - // } - // fmt.Printf("affine \n") + const batchSize = 500 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC15] + } return processChunkG2BatchAffine[bucketg2JacExtendedC15, bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. 
+			return processChunkG2Jacobian[bucketg2JacExtendedC16]
+		}
 		return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16]
 	default:
 		// panic("will not happen c != previous values is not generated by templates")
@@ -525,8 +481,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt
 	// for each chunk, spawn one go routine that'll loop through all the scalars in the
 	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+	// note that buckets is an array allocated on the stack and this is critical for performance

 	// each go routine sends its result in chChunks[i] channel
 	chChunks := make([]chan g2JacExtended, nbChunks)
@@ -536,10 +491,8 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt
 	// the last chunk may be processed with a different method than the rest, as it could be smaller.
 	n := len(points)
-	// fmt.Printf("\n")
-	// fmt.Println("n", n)
 	for j := int(nbChunks - 1); j >= 0; j-- {
-		// fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String())
+		// fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled)
 		processChunk := getChunkProcessorG2(c, chunkStats[j])
 		if j == int(nbChunks-1) {
 			processChunk = getChunkProcessorG2(lastC(c), chunkStats[j])
@@ -618,21 +571,18 @@ func lastC(c uint64) uint64 {
 type chunkStat struct {
 	// relative weight of work compared to other chunks. 100.0 -> nominal weight.
-	weight int
-
-	// average absolute deviation. this is meant to give a sense of statistical
-	// dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets)
-	deviation int
+	weight float32

-	// count the number of buckets that are non zeroes for this chunk
-	nonZeroBuckets int
+	// // average absolute deviation. this is meant to give a sense of statistical
+	// // dispersion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets)
+	// deviation int

-	// average ops per non-zero buckets
-	averageOpsPerBucket int
-}
+	// percentage of buckets filled in the window
+	ppBucketFilled float32
+	nbBucketFilled int

-func (c *chunkStat) String() string {
-	return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket)
+	// // average ops per non-zero buckets
+	// averageOpsPerBucket int
 }

 // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits
@@ -721,74 +671,49 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks
 	}, nbTasks)

-	// abs := func(v int) int {
-	// 	if v < 0 {
-	// 		return -v
-	// 	}
-	// 	return v
-	// }
-
 	// aggregate chunk stats
 	chunkStats := make([]chunkStat, nbChunks)
 	parallel.Execute(len(chunkStats), func(start, end int) {
 		// for each chunk compute the statistics
 		for chunkID := start; chunkID < end; chunkID++ {
-			// var opsPerBucket [1 << 15]int // max value is 16 for c
+			// indicates if a bucket is hit.
+ var b bitSetC16 + // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - // nz := 0 // non zero buckets count + nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - // bucketID := digit >> 1 - // if digit &1 == 0 { - // bucketID-=1 - // } - // if opsPerBucket[bucketID] == 0 { - // nz++ - // } - // opsPerBucket[bucketID]++ + bucketID := digit >> 1 + if digit&1 == 0 { + bucketID -= 1 + } + if !b[bucketID] { + nz++ + b[bucketID] = true + } } - chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - // chunkStats[chunkID].nonZeroBuckets = nz - - // if nz == 0 { - // return // no ops, only zeroes - // } - - // bound := 1 << (c-1) - // if chunkID == int(nbChunks-1) { - // bound = 1 << (lastC(c)-1) - // } - // mean := totalOps / nz - // aad := 0 - // averageOpsPerBucket := 0 - // for b:=0; b < bound; b++ { - // if opsPerBucket[b] == 0 { - // continue - // } - // aad += abs(opsPerBucket[b] - mean) - // averageOpsPerBucket += opsPerBucket[b] - // } - // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - // chunkStats[chunkID].deviation = aad / nz + chunkStats[chunkID].weight = float32(totalOps) // count number of ops for now, we will compute the weight after + chunkStats[chunkID].ppBucketFilled = (float32(nz) * 100.0) / float32(int(1<<(c-1))) + chunkStats[chunkID].nbBucketFilled = nz } }, nbTasks) - totalOps := 0 + totalOps := float32(0.0) for _, stat := range chunkStats { totalOps += stat.weight } - target := totalOps / int(nbChunks) - if target != 0 { + target := totalOps / float32(nbChunks) + if target != 0.0 { // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = (chunkStats[i].weight * 100) / target + chunkStats[i].weight = (chunkStats[i].weight * 100.0) / target } } diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index 65e74e0491..bb1a3ac61e 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -222,38 +222,38 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -263,19 +263,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } @@ -539,38 +539,38 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -580,19 +580,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 16a77feb26..46a8c8bd4f 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -18,7 +18,6 @@ package bls24315 import ( "errors" - "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls24-315/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -141,99 +140,79 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - // const batchSize = 80 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. 
- // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC10] - // } - // fmt.Printf("affine \n") + const batchSize = 80 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC10] + } return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - // const batchSize = 150 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC11] - // } - // fmt.Printf("affine \n") + const batchSize = 150 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC11] + } return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - // const batchSize = 200 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC12] - // } - // fmt.Printf("affine \n") + const batchSize = 200 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC12] + } return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - // const batchSize = 350 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC13] - // } - // fmt.Printf("affine \n") + const batchSize = 350 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC13] + } return processChunkG1BatchAffine[bucketg1JacExtendedC13, bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - // const batchSize = 400 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. 
- // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC14] - // } - // fmt.Printf("affine \n") + const batchSize = 400 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC14] + } return processChunkG1BatchAffine[bucketg1JacExtendedC14, bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - // const batchSize = 500 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC15] - // } - // fmt.Printf("affine \n") + const batchSize = 500 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC15] + } return processChunkG1BatchAffine[bucketg1JacExtendedC15, bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") @@ -247,8 +226,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan g1JacExtended, nbChunks) @@ -258,10 +236,8 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. 
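	// (editorial aside, assumption about lastC: it returns the bit-width of
	// the final window, which is narrower whenever c does not divide the
	// scalar bit-length; e.g. a 254-bit scalar split with c == 16 leaves
	// 254 - 15*16 = 14 bits for the last chunk.)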
n := len(points) - // fmt.Printf("\n") - // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) + // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -419,99 +395,79 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - // const batchSize = 80 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC10] - // } - // fmt.Printf("affine \n") + const batchSize = 80 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC10] + } return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - // const batchSize = 150 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC11] - // } - // fmt.Printf("affine \n") + const batchSize = 150 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC11] + } return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - // const batchSize = 200 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC12] - // } - // fmt.Printf("affine \n") + const batchSize = 200 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. 
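	// (illustrative, hypothetical numbers: with c == 12 there are 1<<11 = 2048
	// buckets; if the stats pass saw only ~150 of them hit, batches of 200
	// conflict-free additions can rarely be filled, so the extended-jacobian
	// processor is the better trade here.)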
+ return processChunkG2Jacobian[bucketg2JacExtendedC12] + } return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - // const batchSize = 350 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC13] - // } - // fmt.Printf("affine \n") + const batchSize = 350 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC13] + } return processChunkG2BatchAffine[bucketg2JacExtendedC13, bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - // const batchSize = 400 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC14] - // } - // fmt.Printf("affine \n") + const batchSize = 400 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC14] + } return processChunkG2BatchAffine[bucketg2JacExtendedC14, bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - // const batchSize = 500 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC15] - // } - // fmt.Printf("affine \n") + const batchSize = 500 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC15] + } return processChunkG2BatchAffine[bucketg2JacExtendedC15, bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. 
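	// (editorial note: the batchSize constants, 80 at c == 10 up to 640 here
	// at c == 16, read as empirically tuned; they grow with the bucket count,
	// though not proportionally to it.)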
+ return processChunkG2Jacobian[bucketg2JacExtendedC16] + } return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: // panic("will not happen c != previous values is not generated by templates") @@ -525,8 +481,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan g2JacExtended, nbChunks) @@ -536,10 +491,8 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - // fmt.Printf("\n") - // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) + // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) @@ -618,21 +571,18 @@ func lastC(c uint64) uint64 { type chunkStat struct { // relative weight of work compared to other chunks. 100.0 -> nominal weight. - weight int - - // average absolute deviation. this is meant to give a sense of statistical - // dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) - deviation int + weight float32 - // count the number of buckets that are non zeroes for this chunk - nonZeroBuckets int + // // average absolute deviation. this is meant to give a sense of statistical + // // dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) + // deviation int - // average ops per non-zero buckets - averageOpsPerBucket int -} + // percentage of bucket filled in the window; + ppBucketFilled float32 + nbBucketFilled int -func (c *chunkStat) String() string { - return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket) + // // average ops per non-zero buckets + // averageOpsPerBucket int } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits @@ -721,74 +671,49 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - // abs := func(v int) int { - // if v < 0 { - // return -v - // } - // return v - // } - // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - // var opsPerBucket [1 << 15]int // max value is 16 for c + // indicates if a bucket is hit. 
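	// (editorial note: b is declared inside the per-chunk loop, so it starts
	// zeroed for every chunk; no explicit reset is needed between chunks.)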
+ var b bitSetC16 + // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - // nz := 0 // non zero buckets count + nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - // bucketID := digit >> 1 - // if digit &1 == 0 { - // bucketID-=1 - // } - // if opsPerBucket[bucketID] == 0 { - // nz++ - // } - // opsPerBucket[bucketID]++ + bucketID := digit >> 1 + if digit&1 == 0 { + bucketID -= 1 + } + if !b[bucketID] { + nz++ + b[bucketID] = true + } } - chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - // chunkStats[chunkID].nonZeroBuckets = nz - - // if nz == 0 { - // return // no ops, only zeroes - // } - - // bound := 1 << (c-1) - // if chunkID == int(nbChunks-1) { - // bound = 1 << (lastC(c)-1) - // } - // mean := totalOps / nz - // aad := 0 - // averageOpsPerBucket := 0 - // for b:=0; b < bound; b++ { - // if opsPerBucket[b] == 0 { - // continue - // } - // aad += abs(opsPerBucket[b] - mean) - // averageOpsPerBucket += opsPerBucket[b] - // } - // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - // chunkStats[chunkID].deviation = aad / nz + chunkStats[chunkID].weight = float32(totalOps) // count number of ops for now, we will compute the weight after + chunkStats[chunkID].ppBucketFilled = (float32(nz) * 100.0) / float32(int(1<<(c-1))) + chunkStats[chunkID].nbBucketFilled = nz } }, nbTasks) - totalOps := 0 + totalOps := float32(0.0) for _, stat := range chunkStats { totalOps += stat.weight } - target := totalOps / int(nbChunks) - if target != 0 { + target := totalOps / float32(nbChunks) + if target != 0.0 { // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = (chunkStats[i].weight * 100) / target + chunkStats[i].weight = (chunkStats[i].weight * 100.0) / target } } diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index 1eb2ff3e0f..f44d8d7b81 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -222,38 +222,38 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -263,19 +263,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } @@ -539,38 +539,38 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -580,19 +580,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index 9c8c78a3f6..d5232436d2 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -18,7 +18,6 @@ package bls24317 import ( "errors" - "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls24-317/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -141,99 +140,79 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - // const batchSize = 80 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. 
- // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC10] - // } - // fmt.Printf("affine \n") + const batchSize = 80 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC10] + } return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - // const batchSize = 150 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC11] - // } - // fmt.Printf("affine \n") + const batchSize = 150 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC11] + } return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - // const batchSize = 200 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC12] - // } - // fmt.Printf("affine \n") + const batchSize = 200 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC12] + } return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - // const batchSize = 350 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC13] - // } - // fmt.Printf("affine \n") + const batchSize = 350 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC13] + } return processChunkG1BatchAffine[bucketg1JacExtendedC13, bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - // const batchSize = 400 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. 
- // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC14] - // } - // fmt.Printf("affine \n") + const batchSize = 400 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC14] + } return processChunkG1BatchAffine[bucketg1JacExtendedC14, bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - // const batchSize = 500 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC15] - // } - // fmt.Printf("affine \n") + const batchSize = 500 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC15] + } return processChunkG1BatchAffine[bucketg1JacExtendedC15, bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") @@ -247,8 +226,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan g1JacExtended, nbChunks) @@ -258,10 +236,8 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. 
n := len(points) - // fmt.Printf("\n") - // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) + // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -419,99 +395,79 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - // const batchSize = 80 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC10] - // } - // fmt.Printf("affine \n") + const batchSize = 80 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC10] + } return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - // const batchSize = 150 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC11] - // } - // fmt.Printf("affine \n") + const batchSize = 150 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC11] + } return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - // const batchSize = 200 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC12] - // } - // fmt.Printf("affine \n") + const batchSize = 200 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. 
+ return processChunkG2Jacobian[bucketg2JacExtendedC12] + } return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - // const batchSize = 350 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC13] - // } - // fmt.Printf("affine \n") + const batchSize = 350 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC13] + } return processChunkG2BatchAffine[bucketg2JacExtendedC13, bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - // const batchSize = 400 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC14] - // } - // fmt.Printf("affine \n") + const batchSize = 400 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC14] + } return processChunkG2BatchAffine[bucketg2JacExtendedC14, bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - // const batchSize = 500 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC15] - // } - // fmt.Printf("affine \n") + const batchSize = 500 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC15] + } return processChunkG2BatchAffine[bucketg2JacExtendedC15, bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. 
+ return processChunkG2Jacobian[bucketg2JacExtendedC16] + } return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: // panic("will not happen c != previous values is not generated by templates") @@ -525,8 +481,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan g2JacExtended, nbChunks) @@ -536,10 +491,8 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - // fmt.Printf("\n") - // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) + // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) @@ -618,21 +571,18 @@ func lastC(c uint64) uint64 { type chunkStat struct { // relative weight of work compared to other chunks. 100.0 -> nominal weight. - weight int - - // average absolute deviation. this is meant to give a sense of statistical - // dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) - deviation int + weight float32 - // count the number of buckets that are non zeroes for this chunk - nonZeroBuckets int + // // average absolute deviation. this is meant to give a sense of statistical + // // dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) + // deviation int - // average ops per non-zero buckets - averageOpsPerBucket int -} + // percentage of bucket filled in the window; + ppBucketFilled float32 + nbBucketFilled int -func (c *chunkStat) String() string { - return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket) + // // average ops per non-zero buckets + // averageOpsPerBucket int } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits @@ -721,74 +671,49 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - // abs := func(v int) int { - // if v < 0 { - // return -v - // } - // return v - // } - // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - // var opsPerBucket [1 << 15]int // max value is 16 for c + // indicates if a bucket is hit. 
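	// (editorial observation: ppBucketFilled below divides by 1<<(c-1) for
	// every chunk, including the last one, which actually has only
	// 1<<(lastC(c)-1) buckets; the fill percentage reported for that chunk is
	// therefore a slight underestimate.)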
+ var b bitSetC16 + // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - // nz := 0 // non zero buckets count + nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - // bucketID := digit >> 1 - // if digit &1 == 0 { - // bucketID-=1 - // } - // if opsPerBucket[bucketID] == 0 { - // nz++ - // } - // opsPerBucket[bucketID]++ + bucketID := digit >> 1 + if digit&1 == 0 { + bucketID -= 1 + } + if !b[bucketID] { + nz++ + b[bucketID] = true + } } - chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - // chunkStats[chunkID].nonZeroBuckets = nz - - // if nz == 0 { - // return // no ops, only zeroes - // } - - // bound := 1 << (c-1) - // if chunkID == int(nbChunks-1) { - // bound = 1 << (lastC(c)-1) - // } - // mean := totalOps / nz - // aad := 0 - // averageOpsPerBucket := 0 - // for b:=0; b < bound; b++ { - // if opsPerBucket[b] == 0 { - // continue - // } - // aad += abs(opsPerBucket[b] - mean) - // averageOpsPerBucket += opsPerBucket[b] - // } - // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - // chunkStats[chunkID].deviation = aad / nz + chunkStats[chunkID].weight = float32(totalOps) // count number of ops for now, we will compute the weight after + chunkStats[chunkID].ppBucketFilled = (float32(nz) * 100.0) / float32(int(1<<(c-1))) + chunkStats[chunkID].nbBucketFilled = nz } }, nbTasks) - totalOps := 0 + totalOps := float32(0.0) for _, stat := range chunkStats { totalOps += stat.weight } - target := totalOps / int(nbChunks) - if target != 0 { + target := totalOps / float32(nbChunks) + if target != 0.0 { // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = (chunkStats[i].weight * 100) / target + chunkStats[i].weight = (chunkStats[i].weight * 100.0) / target } } diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index 14af45e5b5..b3faa9e76f 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -222,38 +222,38 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -263,19 +263,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } @@ -539,38 +539,38 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -580,19 +580,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index b06acb685c..6c68b58cdd 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -18,7 +18,6 @@ package bn254 import ( "errors" - "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bn254/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -141,99 +140,79 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - // const batchSize = 80 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. 
- // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC10] - // } - // fmt.Printf("affine \n") + const batchSize = 80 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC10] + } return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - // const batchSize = 150 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC11] - // } - // fmt.Printf("affine \n") + const batchSize = 150 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC11] + } return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - // const batchSize = 200 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC12] - // } - // fmt.Printf("affine \n") + const batchSize = 200 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC12] + } return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - // const batchSize = 350 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC13] - // } - // fmt.Printf("affine \n") + const batchSize = 350 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC13] + } return processChunkG1BatchAffine[bucketg1JacExtendedC13, bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - // const batchSize = 400 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. 
- // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC14] - // } - // fmt.Printf("affine \n") + const batchSize = 400 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC14] + } return processChunkG1BatchAffine[bucketg1JacExtendedC14, bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - // const batchSize = 500 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC15] - // } - // fmt.Printf("affine \n") + const batchSize = 500 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC15] + } return processChunkG1BatchAffine[bucketg1JacExtendedC15, bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") @@ -247,8 +226,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan g1JacExtended, nbChunks) @@ -258,10 +236,8 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. 
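	// (for instance, with bn254's 254-bit scalar field and c = 16, the first
	// fifteen windows are 16 bits wide and the final window covers only the
	// leftover high bits, roughly 254 - 15*16 = 14 of them; lastC(c) reports
	// that narrower width for the dispatch below.)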
n := len(points) - // fmt.Printf("\n") - // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) + // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -419,99 +395,79 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - // const batchSize = 80 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC10] - // } - // fmt.Printf("affine \n") + const batchSize = 80 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC10] + } return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - // const batchSize = 150 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC11] - // } - // fmt.Printf("affine \n") + const batchSize = 150 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC11] + } return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - // const batchSize = 200 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC12] - // } - // fmt.Printf("affine \n") + const batchSize = 200 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. 
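+	// (rule of thumb: the jacobian path has no batching machinery, so when a
+	// chunk hits fewer distinct buckets than a single batch can hold, the
+	// batch affine scheduler cannot fill its batches and tends to lose.)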
+ return processChunkG2Jacobian[bucketg2JacExtendedC12] + } return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - // const batchSize = 350 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC13] - // } - // fmt.Printf("affine \n") + const batchSize = 350 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC13] + } return processChunkG2BatchAffine[bucketg2JacExtendedC13, bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - // const batchSize = 400 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC14] - // } - // fmt.Printf("affine \n") + const batchSize = 400 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC14] + } return processChunkG2BatchAffine[bucketg2JacExtendedC14, bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - // const batchSize = 500 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC15] - // } - // fmt.Printf("affine \n") + const batchSize = 500 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC15] + } return processChunkG2BatchAffine[bucketg2JacExtendedC15, bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. 
+ return processChunkG2Jacobian[bucketg2JacExtendedC16] + } return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: // panic("will not happen c != previous values is not generated by templates") @@ -525,8 +481,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan g2JacExtended, nbChunks) @@ -536,10 +491,8 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - // fmt.Printf("\n") - // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) + // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) @@ -618,21 +571,18 @@ func lastC(c uint64) uint64 { type chunkStat struct { // relative weight of work compared to other chunks. 100.0 -> nominal weight. - weight int - - // average absolute deviation. this is meant to give a sense of statistical - // dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) - deviation int + weight float32 - // count the number of buckets that are non zeroes for this chunk - nonZeroBuckets int + // // average absolute deviation. this is meant to give a sense of statistical + // // dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) + // deviation int - // average ops per non-zero buckets - averageOpsPerBucket int -} + // percentage of bucket filled in the window; + ppBucketFilled float32 + nbBucketFilled int -func (c *chunkStat) String() string { - return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket) + // // average ops per non-zero buckets + // averageOpsPerBucket int } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits @@ -721,74 +671,49 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - // abs := func(v int) int { - // if v < 0 { - // return -v - // } - // return v - // } - // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - // var opsPerBucket [1 << 15]int // max value is 16 for c + // indicates if a bucket is hit. 
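+			// (a non-zero digit d maps to bucket d>>1 when d is odd and to
+			// bucket (d>>1)-1 when d is even, so digits 1 and 2 share bucket 0,
+			// digits 3 and 4 share bucket 1, and so on; the low bit appears to
+			// carry the sign of the signed-digit encoding.)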
+ var b bitSetC16 + // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - // nz := 0 // non zero buckets count + nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - // bucketID := digit >> 1 - // if digit &1 == 0 { - // bucketID-=1 - // } - // if opsPerBucket[bucketID] == 0 { - // nz++ - // } - // opsPerBucket[bucketID]++ + bucketID := digit >> 1 + if digit&1 == 0 { + bucketID -= 1 + } + if !b[bucketID] { + nz++ + b[bucketID] = true + } } - chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - // chunkStats[chunkID].nonZeroBuckets = nz - - // if nz == 0 { - // return // no ops, only zeroes - // } - - // bound := 1 << (c-1) - // if chunkID == int(nbChunks-1) { - // bound = 1 << (lastC(c)-1) - // } - // mean := totalOps / nz - // aad := 0 - // averageOpsPerBucket := 0 - // for b:=0; b < bound; b++ { - // if opsPerBucket[b] == 0 { - // continue - // } - // aad += abs(opsPerBucket[b] - mean) - // averageOpsPerBucket += opsPerBucket[b] - // } - // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - // chunkStats[chunkID].deviation = aad / nz + chunkStats[chunkID].weight = float32(totalOps) // count number of ops for now, we will compute the weight after + chunkStats[chunkID].ppBucketFilled = (float32(nz) * 100.0) / float32(int(1<<(c-1))) + chunkStats[chunkID].nbBucketFilled = nz } }, nbTasks) - totalOps := 0 + totalOps := float32(0.0) for _, stat := range chunkStats { totalOps += stat.weight } - target := totalOps / int(nbChunks) - if target != 0 { + target := totalOps / float32(nbChunks) + if target != 0.0 { // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = (chunkStats[i].weight * 100) / target + chunkStats[i].weight = (chunkStats[i].weight * 100.0) / target } } diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index b3a812b962..c299f039b8 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -222,38 +222,38 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -263,19 +263,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } @@ -539,38 +539,38 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -580,19 +580,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index 1bc8b9e0ef..e700b666bf 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -18,7 +18,6 @@ package bw6633 import ( "errors" - "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-633/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -141,24 +140,19 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 8: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. 
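+		// (batchSize is tuned per window width c: the bn254 file above uses 80
+		// for c=10, 150 for c=11, 200 for c=12, 350 for c=13, 400 for c=14,
+		// 500 for c=15 and 640 for c=16.)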
+ if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") @@ -172,8 +166,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan g1JacExtended, nbChunks) @@ -183,10 +176,8 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - // fmt.Printf("\n") - // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) + // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -344,24 +335,19 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 8: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. 
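+			// (bw6-633 only generates windows of width 4, 5, 8 and 16, so c=16
+			// is the single case with a batch affine variant on this curve.)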
+ return processChunkG2Jacobian[bucketg2JacExtendedC16] + } return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: // panic("will not happen c != previous values is not generated by templates") @@ -375,8 +361,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan g2JacExtended, nbChunks) @@ -386,10 +371,8 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - // fmt.Printf("\n") - // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) + // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) @@ -468,21 +451,18 @@ func lastC(c uint64) uint64 { type chunkStat struct { // relative weight of work compared to other chunks. 100.0 -> nominal weight. - weight int + weight float32 - // average absolute deviation. this is meant to give a sense of statistical - // dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) - deviation int + // // average absolute deviation. this is meant to give a sense of statistical + // // dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) + // deviation int - // count the number of buckets that are non zeroes for this chunk - nonZeroBuckets int - - // average ops per non-zero buckets - averageOpsPerBucket int -} + // percentage of bucket filled in the window; + ppBucketFilled float32 + nbBucketFilled int -func (c *chunkStat) String() string { - return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket) + // // average ops per non-zero buckets + // averageOpsPerBucket int } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits @@ -571,74 +551,49 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - // abs := func(v int) int { - // if v < 0 { - // return -v - // } - // return v - // } - // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - // var opsPerBucket [1 << 15]int // max value is 16 for c + // indicates if a bucket is hit. 
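+			// (the weights collected below are normalized at the end of this
+			// function so that 100 means an average chunk: e.g. three chunks
+			// doing 50, 100 and 150 additions give totalOps = 300, target = 100,
+			// and final weights of 50, 100 and 150.)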
+ var b bitSetC16 + // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - // nz := 0 // non zero buckets count + nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - // bucketID := digit >> 1 - // if digit &1 == 0 { - // bucketID-=1 - // } - // if opsPerBucket[bucketID] == 0 { - // nz++ - // } - // opsPerBucket[bucketID]++ + bucketID := digit >> 1 + if digit&1 == 0 { + bucketID -= 1 + } + if !b[bucketID] { + nz++ + b[bucketID] = true + } } - chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - // chunkStats[chunkID].nonZeroBuckets = nz - - // if nz == 0 { - // return // no ops, only zeroes - // } - - // bound := 1 << (c-1) - // if chunkID == int(nbChunks-1) { - // bound = 1 << (lastC(c)-1) - // } - // mean := totalOps / nz - // aad := 0 - // averageOpsPerBucket := 0 - // for b:=0; b < bound; b++ { - // if opsPerBucket[b] == 0 { - // continue - // } - // aad += abs(opsPerBucket[b] - mean) - // averageOpsPerBucket += opsPerBucket[b] - // } - // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - // chunkStats[chunkID].deviation = aad / nz + chunkStats[chunkID].weight = float32(totalOps) // count number of ops for now, we will compute the weight after + chunkStats[chunkID].ppBucketFilled = (float32(nz) * 100.0) / float32(int(1<<(c-1))) + chunkStats[chunkID].nbBucketFilled = nz } }, nbTasks) - totalOps := 0 + totalOps := float32(0.0) for _, stat := range chunkStats { totalOps += stat.weight } - target := totalOps / int(nbChunks) - if target != 0 { + target := totalOps / float32(nbChunks) + if target != 0.0 { // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = (chunkStats[i].weight * 100) / target + chunkStats[i].weight = (chunkStats[i].weight * 100.0) / target } } diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 95312917d9..bee27d7123 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -222,38 +222,38 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -263,19 +263,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } @@ -539,38 +539,38 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -580,19 +580,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index e695c8791f..c81d3d8c0b 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -18,7 +18,6 @@ package bw6756 import ( "errors" - "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-756/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -141,24 +140,19 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 8: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. 
+ if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") @@ -172,8 +166,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan g1JacExtended, nbChunks) @@ -183,10 +176,8 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - // fmt.Printf("\n") - // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) + // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -344,24 +335,19 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 8: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. 
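+		// (the long type-parameter list on the batch affine variant below is,
+		// presumably, how the generator wires in the per-width sizes: the
+		// bucket arrays, the bitset marking busy buckets, and the
+		// fixed-capacity batch and queue arrays.)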
+ return processChunkG2Jacobian[bucketg2JacExtendedC16] + } return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: // panic("will not happen c != previous values is not generated by templates") @@ -375,8 +361,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan g2JacExtended, nbChunks) @@ -386,10 +371,8 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - // fmt.Printf("\n") - // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) + // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) @@ -468,21 +451,18 @@ func lastC(c uint64) uint64 { type chunkStat struct { // relative weight of work compared to other chunks. 100.0 -> nominal weight. - weight int + weight float32 - // average absolute deviation. this is meant to give a sense of statistical - // dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) - deviation int + // // average absolute deviation. this is meant to give a sense of statistical + // // dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) + // deviation int - // count the number of buckets that are non zeroes for this chunk - nonZeroBuckets int - - // average ops per non-zero buckets - averageOpsPerBucket int -} + // percentage of bucket filled in the window; + ppBucketFilled float32 + nbBucketFilled int -func (c *chunkStat) String() string { - return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket) + // // average ops per non-zero buckets + // averageOpsPerBucket int } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits @@ -571,74 +551,49 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - // abs := func(v int) int { - // if v < 0 { - // return -v - // } - // return v - // } - // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - // var opsPerBucket [1 << 15]int // max value is 16 for c + // indicates if a bucket is hit. 
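+			// (ppBucketFilled below divides by 1<<(c-1) rather than 1<<c
+			// because the signed-digit encoding halves the number of buckets a
+			// window needs.)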
+ var b bitSetC16 + // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - // nz := 0 // non zero buckets count + nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - // bucketID := digit >> 1 - // if digit &1 == 0 { - // bucketID-=1 - // } - // if opsPerBucket[bucketID] == 0 { - // nz++ - // } - // opsPerBucket[bucketID]++ + bucketID := digit >> 1 + if digit&1 == 0 { + bucketID -= 1 + } + if !b[bucketID] { + nz++ + b[bucketID] = true + } } - chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - // chunkStats[chunkID].nonZeroBuckets = nz - - // if nz == 0 { - // return // no ops, only zeroes - // } - - // bound := 1 << (c-1) - // if chunkID == int(nbChunks-1) { - // bound = 1 << (lastC(c)-1) - // } - // mean := totalOps / nz - // aad := 0 - // averageOpsPerBucket := 0 - // for b:=0; b < bound; b++ { - // if opsPerBucket[b] == 0 { - // continue - // } - // aad += abs(opsPerBucket[b] - mean) - // averageOpsPerBucket += opsPerBucket[b] - // } - // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - // chunkStats[chunkID].deviation = aad / nz + chunkStats[chunkID].weight = float32(totalOps) // count number of ops for now, we will compute the weight after + chunkStats[chunkID].ppBucketFilled = (float32(nz) * 100.0) / float32(int(1<<(c-1))) + chunkStats[chunkID].nbBucketFilled = nz } }, nbTasks) - totalOps := 0 + totalOps := float32(0.0) for _, stat := range chunkStats { totalOps += stat.weight } - target := totalOps / int(nbChunks) - if target != 0 { + target := totalOps / float32(nbChunks) + if target != 0.0 { // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = (chunkStats[i].weight * 100) / target + chunkStats[i].weight = (chunkStats[i].weight * 100.0) / target } } diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index a94008b836..4fce6462a5 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -222,38 +222,38 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -263,19 +263,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } @@ -539,38 +539,38 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -580,19 +580,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index b89e06e566..d928a013fd 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -18,7 +18,6 @@ package bw6761 import ( "errors" - "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -141,24 +140,19 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 8: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. 
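+		// (stat is produced by partitionScalars, which scans the digits of
+		// each chunk once, up front, and records how many distinct buckets
+		// get hit.)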
+ if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") @@ -172,8 +166,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan g1JacExtended, nbChunks) @@ -183,10 +176,8 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - // fmt.Printf("\n") - // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) + // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -344,24 +335,19 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 8: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. 
+ return processChunkG2Jacobian[bucketg2JacExtendedC16] + } return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: // panic("will not happen c != previous values is not generated by templates") @@ -375,8 +361,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan g2JacExtended, nbChunks) @@ -386,10 +371,8 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - // fmt.Printf("\n") - // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) + // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) @@ -468,21 +451,18 @@ func lastC(c uint64) uint64 { type chunkStat struct { // relative weight of work compared to other chunks. 100.0 -> nominal weight. - weight int + weight float32 - // average absolute deviation. this is meant to give a sense of statistical - // dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) - deviation int + // // average absolute deviation. this is meant to give a sense of statistical + // // dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) + // deviation int - // count the number of buckets that are non zeroes for this chunk - nonZeroBuckets int - - // average ops per non-zero buckets - averageOpsPerBucket int -} + // percentage of bucket filled in the window; + ppBucketFilled float32 + nbBucketFilled int -func (c *chunkStat) String() string { - return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket) + // // average ops per non-zero buckets + // averageOpsPerBucket int } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits @@ -571,74 +551,49 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - // abs := func(v int) int { - // if v < 0 { - // return -v - // } - // return v - // } - // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - // var opsPerBucket [1 << 15]int // max value is 16 for c + // indicates if a bucket is hit. 
+ var b bitSetC16 + // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - // nz := 0 // non zero buckets count + nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - // bucketID := digit >> 1 - // if digit &1 == 0 { - // bucketID-=1 - // } - // if opsPerBucket[bucketID] == 0 { - // nz++ - // } - // opsPerBucket[bucketID]++ + bucketID := digit >> 1 + if digit&1 == 0 { + bucketID -= 1 + } + if !b[bucketID] { + nz++ + b[bucketID] = true + } } - chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - // chunkStats[chunkID].nonZeroBuckets = nz - - // if nz == 0 { - // return // no ops, only zeroes - // } - - // bound := 1 << (c-1) - // if chunkID == int(nbChunks-1) { - // bound = 1 << (lastC(c)-1) - // } - // mean := totalOps / nz - // aad := 0 - // averageOpsPerBucket := 0 - // for b:=0; b < bound; b++ { - // if opsPerBucket[b] == 0 { - // continue - // } - // aad += abs(opsPerBucket[b] - mean) - // averageOpsPerBucket += opsPerBucket[b] - // } - // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - // chunkStats[chunkID].deviation = aad / nz + chunkStats[chunkID].weight = float32(totalOps) // count number of ops for now, we will compute the weight after + chunkStats[chunkID].ppBucketFilled = (float32(nz) * 100.0) / float32(int(1<<(c-1))) + chunkStats[chunkID].nbBucketFilled = nz } }, nbTasks) - totalOps := 0 + totalOps := float32(0.0) for _, stat := range chunkStats { totalOps += stat.weight } - target := totalOps / int(nbChunks) - if target != 0 { + target := totalOps / float32(nbChunks) + if target != 0.0 { // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = (chunkStats[i].weight * 100) / target + chunkStats[i].weight = (chunkStats[i].weight * 100.0) / target } } diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index 7d9abce749..d21931beca 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -222,38 +222,38 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -263,19 +263,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } @@ -539,38 +539,38 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -580,19 +580,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index b9563c9a61..d1826081ce 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -14,7 +14,6 @@ import ( "errors" "math" "runtime" - "fmt" ) {{ template "multiexp" dict "PointName" .G1.PointName "UPointName" (toUpper .G1.PointName) "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange}} @@ -57,22 +56,20 @@ func lastC(c uint64) uint64 { type chunkStat struct { // relative weight of work compared to other chunks. 100.0 -> nominal weight. - weight int + weight float32 - // average absolute deviation. this is meant to give a sense of statistical - // dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) - deviation int + // // average absolute deviation. 
this is meant to give a sense of statistical + // // dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) + // deviation int - // count the number of buckets that are non zeroes for this chunk - nonZeroBuckets int + // percentage of bucket filled in the window; + ppBucketFilled float32 + nbBucketFilled int - // average ops per non-zero buckets - averageOpsPerBucket int + // // average ops per non-zero buckets + // averageOpsPerBucket int } -func (c *chunkStat) String() string { - return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket) -} // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits @@ -164,75 +161,50 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - // abs := func(v int) int { - // if v < 0 { - // return -v - // } - // return v - // } - + // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - // var opsPerBucket [1 << 15]int // max value is 16 for c + // indicates if a bucket is hit. + var b bitSetC16 + // digits for the chunk chunkDigits := digits[chunkID*len(scalars):(chunkID+1)*len(scalars)] totalOps := 0 - // nz := 0 // non zero buckets count + nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - // bucketID := digit >> 1 - // if digit &1 == 0 { - // bucketID-=1 - // } - // if opsPerBucket[bucketID] == 0 { - // nz++ - // } - // opsPerBucket[bucketID]++ + bucketID := digit >> 1 + if digit &1 == 0 { + bucketID-=1 + } + if !b[bucketID] { + nz++ + b[bucketID] = true + } } - chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - // chunkStats[chunkID].nonZeroBuckets = nz - - - // if nz == 0 { - // return // no ops, only zeroes - // } - - // bound := 1 << (c-1) - // if chunkID == int(nbChunks-1) { - // bound = 1 << (lastC(c)-1) - // } - // mean := totalOps / nz - // aad := 0 - // averageOpsPerBucket := 0 - // for b:=0; b < bound; b++ { - // if opsPerBucket[b] == 0 { - // continue - // } - // aad += abs(opsPerBucket[b] - mean) - // averageOpsPerBucket += opsPerBucket[b] - // } - // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - // chunkStats[chunkID].deviation = aad / nz + chunkStats[chunkID].weight = float32(totalOps) // count number of ops for now, we will compute the weight after + chunkStats[chunkID].ppBucketFilled = (float32(nz) * 100.0) / float32(int(1 << (c-1))) + chunkStats[chunkID].nbBucketFilled = nz } }, nbTasks) - totalOps := 0 + totalOps := float32(0.0) for _, stat := range chunkStats { totalOps+=stat.weight } - target := totalOps / int(nbChunks) - if target != 0 { + target := totalOps / float32(nbChunks) + if target != 0.0 { // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. 
for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = (chunkStats[i].weight * 100) / target + chunkStats[i].weight = (chunkStats[i].weight * 100.0) / target } } @@ -491,18 +463,15 @@ func getChunkProcessor{{ $.UPointName }}(c uint64, stat chunkStat) func(chunkID {{range $c := $.CRange}} case {{$c}}: {{- if le $c 9}} - // fmt.Printf("jacobian \n") return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}] {{- else}} - // const batchSize = {{batchSize $c}} - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}] - // } - // fmt.Printf("affine \n") + const batchSize = {{batchSize $c}} + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}] + } return processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TJacobianExtended }}C{{$c}}, bucket{{ $.TAffine }}C{{$c}}, bitSetC{{$c}}, p{{$.TAffine}}C{{$c}}, pp{{$.TAffine}}C{{$c}}, q{{$.TAffine}}C{{$c}}, c{{$.TAffine}}C{{$c}}] {{- end}} {{- end}} @@ -518,8 +487,7 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan {{ $.TJacobianExtended }}, nbChunks) @@ -529,10 +497,8 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T // the last chunk may be processed with a different method than the rest, as it could be smaller. 
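
Concretely, the statistics driving getChunkProcessor reduce to two numbers per window: how many digit operations the chunk contributes, and how many distinct buckets those operations touch. A rough standalone model of the loop above, assuming only what this diff shows (a map stands in for bitSetC16; zero digit = skip, odd digit = add into bucket digit>>1, even digit = subtract from bucket (digit>>1)-1):

func chunkStats(digits []uint16, c uint64) (totalOps, nbBucketFilled int, ppBucketFilled float32) {
	hit := make(map[uint16]bool)
	for _, digit := range digits {
		if digit == 0 {
			continue // this scalar contributes nothing to this window
		}
		totalOps++
		bucketID := digit >> 1
		if digit&1 == 0 {
			bucketID -= 1 // even digits encode a subtraction
		}
		if !hit[bucketID] {
			nbBucketFilled++
			hit[bucketID] = true
		}
	}
	// signed digits halve the bucket count: 2^(c-1) buckets per window
	ppBucketFilled = float32(nbBucketFilled) * 100.0 / float32(int(1)<<(c-1))
	return
}

The weights are then rescaled so that 100.0 is the per-chunk average, which is what the commented-out Printf below reports next to ppBucketFilled.
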
n := len(points) - // fmt.Printf("\n") - // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) + // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessor{{ $.UPointName }}(c, chunkStats[j]) if j == int(nbChunks - 1) { processChunk = getChunkProcessor{{ $.UPointName }}(lastC(c), chunkStats[j]) diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index 91af5a996b..ed18ab1c46 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -237,37 +237,37 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { var ( samplePoints [nbSamples]{{ $.TAffine }} sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:],sampleScalars[:]) - copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i:=0; i < len(sampleScalarsSmallValues);i++ { - if i % 5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. - for i:=0; i < len(sampleScalarsRedundant);i+=100 { - for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. 
+ // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBases{{ toUpper $.PointName }}(samplePoints[:]) var testPoint {{ $.TAffine }} - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -277,19 +277,19 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } From df5fcdf568fdf84fd5ed160667677e43cdc4a79c Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 16 Nov 2022 11:00:16 -0600 Subject: [PATCH 31/43] style: added comments and clean msm --- ecc/bls12-377/multiexp_affine.go | 136 ++++++++---------- ecc/bls12-377/multiexp_test.go | 8 +- ecc/bls12-378/multiexp_affine.go | 136 ++++++++---------- ecc/bls12-378/multiexp_test.go | 8 +- ecc/bls12-381/multiexp_affine.go | 136 ++++++++---------- ecc/bls12-381/multiexp_test.go | 8 +- ecc/bls24-315/multiexp_affine.go | 136 ++++++++---------- ecc/bls24-315/multiexp_test.go | 8 +- ecc/bls24-317/multiexp_affine.go | 136 ++++++++---------- ecc/bls24-317/multiexp_test.go | 8 +- ecc/bn254/multiexp_affine.go | 136 ++++++++---------- ecc/bn254/multiexp_test.go | 8 +- ecc/bw6-633/multiexp_affine.go | 136 ++++++++---------- ecc/bw6-633/multiexp_test.go | 8 +- ecc/bw6-756/multiexp_affine.go | 136 ++++++++---------- ecc/bw6-756/multiexp_test.go | 8 +- ecc/bw6-761/multiexp_affine.go | 136 ++++++++---------- ecc/bw6-761/multiexp_test.go | 8 +- .../ecc/template/multiexp_affine.go.tmpl | 75 ++++------ .../ecc/template/tests/multiexp.go.tmpl | 8 +- 20 files changed, 571 insertions(+), 808 deletions(-) diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index b2c167e9bc..1504538cf5 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -23,13 +23,7 @@ import ( type batchOpG1Affine struct { bucketID uint16 - // pointID uint32 - point G1Affine -} - -func (o batchOpG1Affine) isNeg() bool { - return false - // return o.pointID&1 == 1 + point G1Affine } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -45,10 +39,23 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP points []G1Affine, digits []uint16) { - // init the buckets + // the batch affine addition needs independent points; in other words, for a window of batchSize + // we want to hit independent bucketIDs when processing the digit. 
if there is a conflict (we're trying + // to add 2 different points to the same bucket), then we push the conflicted point to a queue. + // each time the batch is full, we execute it, and tentatively put the points (if not conflict) + // from the top of the queue into the next batch. + // if the queue is full, we "flush it"; we sequentially add the points to the buckets in + // g1JacExtended coordinates. + // The reasoning behind this is the following; batchSize is chosen such as, for a uniformly random + // input, the number of conflicts is going to be low, and the element added to the queue should be immediatly + // processed in the next batch. If it's not the case, then our inputs are not random; and we fallback to + // non-batch-affine version. + + // note that we have 2 sets of buckets + // 1 in G1Affine used with the batch affine additions + // 1 in g1JacExtended used in case the queue of conflicting points var buckets B var bucketsJE BJE - var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -63,13 +70,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) - // var queue [batchSize]batchOpG1Affine batchSize := len(P) - isFull := func() bool { - return cptAdd == batchSize - } + isFull := func() bool { return cptAdd == batchSize } executeAndReset := func() { batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) @@ -77,12 +81,16 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG1Affine) { - // @precondition: ensures bucket is not "used" in current batch + // @precondition: must ensures bucket is not "used" in current batch + // note that there is a bit of duplicate logic between add and addFromQueue + // the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature) + // the compiler will put the queue on the heap. BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P - if !bucketSet[op.bucketID] { - bucketSet[op.bucketID] = true + if BK.IsInfinity() { BK.Set(&op.point) return } @@ -95,7 +103,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP return } BK.setInfinity() - bucketSet[op.bucketID] = false return } @@ -109,13 +116,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if !bucketSet[bucketID] { + if BK.IsInfinity() { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } - bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -127,14 +133,11 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() - bucketSet[bucketID] = false } - return } if isAdd { BK.setInfinity() - bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -158,17 +161,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP qID = 0 } - processQueue := func() { + processTopQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { return } addFromQueue(queue[i]) - // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) - // if isFull() { - // executeAndReset() - // } - // queue[i] = queue[qID-1] + // len(queue) < batchSize so no need to check for full batch. 
qID--
 		}
 	}
 
@@ -190,10 +189,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 			// put it in queue
 			queue[qID].bucketID = bucketID
 			if isAdd {
-				// queue[qID].pointID = uint32(i << 1)
 				queue[qID].point.Set(&points[i])
 			} else {
-				// queue[qID].pointID = uint32(i << 1) + 1
 				queue[qID].point.Neg(&points[i])
 			}
 			qID++
@@ -209,7 +206,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 			add(bucketID, &points[i], isAdd)
 			if isFull() {
 				executeAndReset()
-				processQueue() // TODO top queue only
+				processTopQueue()
 			}
 		}
 
@@ -218,21 +215,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 
 	// empty the queue
 	flushQueue()
-	// for qID != 0 {
-	// 	processQueue()
-	// 	executeAndReset()
-	// }
 
 	// reduce buckets into total
 	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
-
 	var runningSum, total g1JacExtended
 	runningSum.setInfinity()
 	total.setInfinity()
 	for k := len(buckets) - 1; k >= 0; k-- {
-		if bucketSet[k] {
-			runningSum.addMixed(&buckets[k])
-		}
+		runningSum.addMixed(&buckets[k])
 		if !bucketsJE[k].ZZ.IsZero() {
 			runningSum.add(&bucketsJE[k])
 		}
@@ -352,13 +342,7 @@ type qG1AffineC16 [640]batchOpG1Affine
 
 type batchOpG2Affine struct {
 	bucketID uint16
-	// pointID  uint32
-	point    G2Affine
-}
-
-func (o batchOpG2Affine) isNeg() bool {
-	return false
-	// return o.pointID&1 == 1
+	point    G2Affine
 }
 
 // processChunkG2BatchAffine process a chunk of the scalars during the msm
@@ -374,10 +358,23 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 	points []G2Affine,
 	digits []uint16) {
 
-	// init the buckets
+	// the batch affine addition needs independent points; in other words, for a window of batchSize
+	// we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying
+	// to add 2 different points to the same bucket), then we push the conflicted point to a queue.
+	// each time the batch is full, we execute it, and tentatively put the points (if not conflict)
+	// from the top of the queue into the next batch.
+	// if the queue is full, we "flush it"; we sequentially add the points to the buckets in
+	// g2JacExtended coordinates.
+	// The reasoning behind this is the following; batchSize is chosen such that, for a uniformly random
+	// input, the number of conflicts is going to be low, and the element added to the queue should be immediately
+	// processed in the next batch. If it's not the case, then our inputs are not random; and we fall back to
+	// the non-batch-affine version.
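
In miniature, the scheme described in the comment above looks like the sketch below, with plain ints standing in for curve points and buckets. batchAccumulate and its helpers are names invented for this illustration; the real executeAndReset calls batchAddG2Affine, which amortizes a single field inversion over the whole batch, and the real queue is a fixed-size array that triggers flushQueue when it overflows.

func batchAccumulate(bucketIDs []uint16, values []int, nbBuckets int) []int {
	const batchSize = 4 // illustration only; this patch uses 640 for c == 16

	type op struct {
		bucketID uint16
		value    int
	}

	buckets := make([]int, nbBuckets)
	inBatch := make(map[uint16]bool) // buckets already used by the current batch
	var batch, queue []op

	executeAndReset := func() {
		for _, o := range batch { // one pass; the real batch shares 1 inversion
			buckets[o.bucketID] += o.value
		}
		batch = batch[:0]
		inBatch = make(map[uint16]bool)
	}

	processTopQueue := func() {
		for len(queue) > 0 {
			o := queue[len(queue)-1]
			if inBatch[o.bucketID] {
				return // top of queue still conflicts; retry after next batch
			}
			queue = queue[:len(queue)-1]
			inBatch[o.bucketID] = true
			batch = append(batch, o)
		}
	}

	for i, b := range bucketIDs {
		if inBatch[b] {
			queue = append(queue, op{b, values[i]}) // conflict: defer the point
		} else {
			inBatch[b] = true
			batch = append(batch, op{b, values[i]})
		}
		if len(batch) == batchSize {
			executeAndReset()
			processTopQueue()
		}
	}
	executeAndReset()
	for _, o := range queue { // flushQueue: sequential fallback additions
		buckets[o.bucketID] += o.value
	}
	return buckets
}

For instance, batchAccumulate([]uint16{1, 1, 1, 2}, []int{5, 5, 5, 7}, 8) defers the second and third additions to bucket 1 and still ends with buckets[1] == 15 and buckets[2] == 7.
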
+ + // note that we have 2 sets of buckets + // 1 in G2Affine used with the batch affine additions + // 1 in g2JacExtended used in case the queue of conflicting points var buckets B var bucketsJE BJE - var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -392,13 +389,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) - // var queue [batchSize]batchOpG2Affine batchSize := len(P) - isFull := func() bool { - return cptAdd == batchSize - } + isFull := func() bool { return cptAdd == batchSize } executeAndReset := func() { batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) @@ -406,12 +400,16 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG2Affine) { - // @precondition: ensures bucket is not "used" in current batch + // @precondition: must ensures bucket is not "used" in current batch + // note that there is a bit of duplicate logic between add and addFromQueue + // the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature) + // the compiler will put the queue on the heap. BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P - if !bucketSet[op.bucketID] { - bucketSet[op.bucketID] = true + if BK.IsInfinity() { BK.Set(&op.point) return } @@ -424,7 +422,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP return } BK.setInfinity() - bucketSet[op.bucketID] = false return } @@ -438,13 +435,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if !bucketSet[bucketID] { + if BK.IsInfinity() { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } - bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -456,14 +452,11 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() - bucketSet[bucketID] = false } - return } if isAdd { BK.setInfinity() - bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -487,17 +480,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP qID = 0 } - processQueue := func() { + processTopQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { return } addFromQueue(queue[i]) - // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) - // if isFull() { - // executeAndReset() - // } - // queue[i] = queue[qID-1] + // len(queue) < batchSize so no need to check for full batch. 
qID-- } } @@ -519,10 +508,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - // queue[qID].pointID = uint32(i << 1) queue[qID].point.Set(&points[i]) } else { - // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -538,7 +525,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() // TODO top queue only + processTopQueue() } } @@ -547,21 +534,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // empty the queue flushQueue() - // for qID != 0 { - // processQueue() - // executeAndReset() - // } // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - var runningSum, total g2JacExtended runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if bucketSet[k] { - runningSum.addMixed(&buckets[k]) - } + runningSum.addMixed(&buckets[k]) if !bucketsJE[k].ZZ.IsZero() { runningSum.add(&bucketsJE[k]) } diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 8a036b0f54..8700cfd9b3 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -669,11 +669,7 @@ func fillBenchBasesG2(samplePoints []G2Affine) { func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - var mixer fr.Element - mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") - for i := 1; i <= len(sampleScalars); i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() + for i := 0; i < len(sampleScalars); i++ { + sampleScalars[i].SetRandom() } } diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index af73df4882..8299397508 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -23,13 +23,7 @@ import ( type batchOpG1Affine struct { bucketID uint16 - // pointID uint32 - point G1Affine -} - -func (o batchOpG1Affine) isNeg() bool { - return false - // return o.pointID&1 == 1 + point G1Affine } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -45,10 +39,23 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP points []G1Affine, digits []uint16) { - // init the buckets + // the batch affine addition needs independent points; in other words, for a window of batchSize + // we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying + // to add 2 different points to the same bucket), then we push the conflicted point to a queue. + // each time the batch is full, we execute it, and tentatively put the points (if not conflict) + // from the top of the queue into the next batch. + // if the queue is full, we "flush it"; we sequentially add the points to the buckets in + // g1JacExtended coordinates. + // The reasoning behind this is the following; batchSize is chosen such as, for a uniformly random + // input, the number of conflicts is going to be low, and the element added to the queue should be immediatly + // processed in the next batch. If it's not the case, then our inputs are not random; and we fallback to + // non-batch-affine version. 
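
The reduction that closes each of these chunk processors is the classic running-sum trick; a minimal integer model, merging the affine buckets with the JacExtended overflow buckets the way the loop above does (both slices must have equal length, mirroring buckets and bucketsJE):

// total = 1*b[0] + 2*b[1] + ... + n*b[n-1], in 2n additions and no multiplications
func reduceBuckets(affine, extended []int) int {
	runningSum, total := 0, 0
	for k := len(affine) - 1; k >= 0; k-- {
		runningSum += affine[k] + extended[k] // merge both bucket sets on the fly
		total += runningSum                   // b[k] is accumulated k+1 times overall
	}
	return total
}
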
+ + // note that we have 2 sets of buckets + // 1 in G1Affine used with the batch affine additions + // 1 in g1JacExtended used in case the queue of conflicting points var buckets B var bucketsJE BJE - var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -63,13 +70,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) - // var queue [batchSize]batchOpG1Affine batchSize := len(P) - isFull := func() bool { - return cptAdd == batchSize - } + isFull := func() bool { return cptAdd == batchSize } executeAndReset := func() { batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) @@ -77,12 +81,16 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG1Affine) { - // @precondition: ensures bucket is not "used" in current batch + // @precondition: must ensures bucket is not "used" in current batch + // note that there is a bit of duplicate logic between add and addFromQueue + // the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature) + // the compiler will put the queue on the heap. BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P - if !bucketSet[op.bucketID] { - bucketSet[op.bucketID] = true + if BK.IsInfinity() { BK.Set(&op.point) return } @@ -95,7 +103,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP return } BK.setInfinity() - bucketSet[op.bucketID] = false return } @@ -109,13 +116,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if !bucketSet[bucketID] { + if BK.IsInfinity() { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } - bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -127,14 +133,11 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() - bucketSet[bucketID] = false } - return } if isAdd { BK.setInfinity() - bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -158,17 +161,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP qID = 0 } - processQueue := func() { + processTopQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { return } addFromQueue(queue[i]) - // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) - // if isFull() { - // executeAndReset() - // } - // queue[i] = queue[qID-1] + // len(queue) < batchSize so no need to check for full batch. 
qID-- } } @@ -190,10 +189,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - // queue[qID].pointID = uint32(i << 1) queue[qID].point.Set(&points[i]) } else { - // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -209,7 +206,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() // TODO top queue only + processTopQueue() } } @@ -218,21 +215,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // empty the queue flushQueue() - // for qID != 0 { - // processQueue() - // executeAndReset() - // } // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - var runningSum, total g1JacExtended runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if bucketSet[k] { - runningSum.addMixed(&buckets[k]) - } + runningSum.addMixed(&buckets[k]) if !bucketsJE[k].ZZ.IsZero() { runningSum.add(&bucketsJE[k]) } @@ -352,13 +342,7 @@ type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 - // pointID uint32 - point G2Affine -} - -func (o batchOpG2Affine) isNeg() bool { - return false - // return o.pointID&1 == 1 + point G2Affine } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -374,10 +358,23 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP points []G2Affine, digits []uint16) { - // init the buckets + // the batch affine addition needs independent points; in other words, for a window of batchSize + // we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying + // to add 2 different points to the same bucket), then we push the conflicted point to a queue. + // each time the batch is full, we execute it, and tentatively put the points (if not conflict) + // from the top of the queue into the next batch. + // if the queue is full, we "flush it"; we sequentially add the points to the buckets in + // g2JacExtended coordinates. + // The reasoning behind this is the following; batchSize is chosen such as, for a uniformly random + // input, the number of conflicts is going to be low, and the element added to the queue should be immediatly + // processed in the next batch. If it's not the case, then our inputs are not random; and we fallback to + // non-batch-affine version. 
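
The digits consumed above come from partitionScalars; per the statistics code earlier in this patch, the low bit of a digit carries the sign (odd = add into bucket digit>>1, even non-zero = subtract from bucket (digit>>1)-1). A generic sketch of the underlying signed-window decomposition — not this repository's partitionScalars, which also packs that encoding and balances chunks — for a 64-bit scalar:

// signedDigits rewrites scalar in base 2^c with digits in [-2^(c-1), 2^(c-1)],
// so only 2^(c-1) buckets are needed per window (negative digits subtract the
// point instead of adding it).
func signedDigits(scalar uint64, c uint) []int64 {
	mask := uint64(1)<<c - 1
	var digits []int64
	carry := uint64(0)
	for scalar != 0 || carry != 0 {
		d := (scalar & mask) + carry
		scalar >>= c
		carry = 0
		if d > uint64(1)<<(c-1) {
			// borrow: emit d - 2^c (negative) and carry 1 into the next window
			digits = append(digits, int64(d)-int64(uint64(1)<<c))
			carry = 1
		} else {
			digits = append(digits, int64(d))
		}
	}
	return digits
}

Reconstructing sum(digits[i] * 2^(c*i)) gives the scalar back; e.g. with c = 4, the scalar 15 decomposes into [-1, 1] since 15 = -1 + 1*16.
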
+ + // note that we have 2 sets of buckets + // 1 in G2Affine used with the batch affine additions + // 1 in g2JacExtended used in case the queue of conflicting points var buckets B var bucketsJE BJE - var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -392,13 +389,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) - // var queue [batchSize]batchOpG2Affine batchSize := len(P) - isFull := func() bool { - return cptAdd == batchSize - } + isFull := func() bool { return cptAdd == batchSize } executeAndReset := func() { batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) @@ -406,12 +400,16 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG2Affine) { - // @precondition: ensures bucket is not "used" in current batch + // @precondition: must ensures bucket is not "used" in current batch + // note that there is a bit of duplicate logic between add and addFromQueue + // the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature) + // the compiler will put the queue on the heap. BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P - if !bucketSet[op.bucketID] { - bucketSet[op.bucketID] = true + if BK.IsInfinity() { BK.Set(&op.point) return } @@ -424,7 +422,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP return } BK.setInfinity() - bucketSet[op.bucketID] = false return } @@ -438,13 +435,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if !bucketSet[bucketID] { + if BK.IsInfinity() { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } - bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -456,14 +452,11 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() - bucketSet[bucketID] = false } - return } if isAdd { BK.setInfinity() - bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -487,17 +480,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP qID = 0 } - processQueue := func() { + processTopQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { return } addFromQueue(queue[i]) - // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) - // if isFull() { - // executeAndReset() - // } - // queue[i] = queue[qID-1] + // len(queue) < batchSize so no need to check for full batch. 
qID-- } } @@ -519,10 +508,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - // queue[qID].pointID = uint32(i << 1) queue[qID].point.Set(&points[i]) } else { - // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -538,7 +525,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() // TODO top queue only + processTopQueue() } } @@ -547,21 +534,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // empty the queue flushQueue() - // for qID != 0 { - // processQueue() - // executeAndReset() - // } // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - var runningSum, total g2JacExtended runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if bucketSet[k] { - runningSum.addMixed(&buckets[k]) - } + runningSum.addMixed(&buckets[k]) if !bucketsJE[k].ZZ.IsZero() { runningSum.add(&bucketsJE[k]) } diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index 1e9ff1e4de..44c19874e4 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -669,11 +669,7 @@ func fillBenchBasesG2(samplePoints []G2Affine) { func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - var mixer fr.Element - mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") - for i := 1; i <= len(sampleScalars); i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() + for i := 0; i < len(sampleScalars); i++ { + sampleScalars[i].SetRandom() } } diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index 992efc3fa4..b7af2292aa 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -23,13 +23,7 @@ import ( type batchOpG1Affine struct { bucketID uint16 - // pointID uint32 - point G1Affine -} - -func (o batchOpG1Affine) isNeg() bool { - return false - // return o.pointID&1 == 1 + point G1Affine } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -45,10 +39,23 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP points []G1Affine, digits []uint16) { - // init the buckets + // the batch affine addition needs independent points; in other words, for a window of batchSize + // we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying + // to add 2 different points to the same bucket), then we push the conflicted point to a queue. + // each time the batch is full, we execute it, and tentatively put the points (if not conflict) + // from the top of the queue into the next batch. + // if the queue is full, we "flush it"; we sequentially add the points to the buckets in + // g1JacExtended coordinates. + // The reasoning behind this is the following; batchSize is chosen such as, for a uniformly random + // input, the number of conflicts is going to be low, and the element added to the queue should be immediatly + // processed in the next batch. If it's not the case, then our inputs are not random; and we fallback to + // non-batch-affine version. 
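
The claim that conflicts stay rare for uniformly random inputs can be sized with a quick estimate (an idealization introduced here, not something this patch computes): if the bucket IDs entering a batch are independent and uniform over nbBuckets buckets, an op arriving while k ops are already in flight conflicts with probability about k/nbBuckets, so a full batch sees roughly batchSize*(batchSize-1)/(2*nbBuckets) conflicts.

// back-of-the-envelope only; for c = 16: 640*639/(2*32768) ≈ 6.2 deferred
// points per batch of 640, i.e. about 1% of the ops.
func expectedConflictsPerBatch(batchSize, nbBuckets int) float64 {
	return float64(batchSize) * float64(batchSize-1) / (2 * float64(nbBuckets))
}
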
+ + // note that we have 2 sets of buckets + // 1 in G1Affine used with the batch affine additions + // 1 in g1JacExtended used in case the queue of conflicting points var buckets B var bucketsJE BJE - var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -63,13 +70,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) - // var queue [batchSize]batchOpG1Affine batchSize := len(P) - isFull := func() bool { - return cptAdd == batchSize - } + isFull := func() bool { return cptAdd == batchSize } executeAndReset := func() { batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) @@ -77,12 +81,16 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG1Affine) { - // @precondition: ensures bucket is not "used" in current batch + // @precondition: must ensures bucket is not "used" in current batch + // note that there is a bit of duplicate logic between add and addFromQueue + // the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature) + // the compiler will put the queue on the heap. BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P - if !bucketSet[op.bucketID] { - bucketSet[op.bucketID] = true + if BK.IsInfinity() { BK.Set(&op.point) return } @@ -95,7 +103,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP return } BK.setInfinity() - bucketSet[op.bucketID] = false return } @@ -109,13 +116,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if !bucketSet[bucketID] { + if BK.IsInfinity() { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } - bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -127,14 +133,11 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() - bucketSet[bucketID] = false } - return } if isAdd { BK.setInfinity() - bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -158,17 +161,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP qID = 0 } - processQueue := func() { + processTopQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { return } addFromQueue(queue[i]) - // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) - // if isFull() { - // executeAndReset() - // } - // queue[i] = queue[qID-1] + // len(queue) < batchSize so no need to check for full batch. 
qID-- } } @@ -190,10 +189,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - // queue[qID].pointID = uint32(i << 1) queue[qID].point.Set(&points[i]) } else { - // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -209,7 +206,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() // TODO top queue only + processTopQueue() } } @@ -218,21 +215,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // empty the queue flushQueue() - // for qID != 0 { - // processQueue() - // executeAndReset() - // } // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - var runningSum, total g1JacExtended runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if bucketSet[k] { - runningSum.addMixed(&buckets[k]) - } + runningSum.addMixed(&buckets[k]) if !bucketsJE[k].ZZ.IsZero() { runningSum.add(&bucketsJE[k]) } @@ -352,13 +342,7 @@ type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 - // pointID uint32 - point G2Affine -} - -func (o batchOpG2Affine) isNeg() bool { - return false - // return o.pointID&1 == 1 + point G2Affine } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -374,10 +358,23 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP points []G2Affine, digits []uint16) { - // init the buckets + // the batch affine addition needs independent points; in other words, for a window of batchSize + // we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying + // to add 2 different points to the same bucket), then we push the conflicted point to a queue. + // each time the batch is full, we execute it, and tentatively put the points (if not conflict) + // from the top of the queue into the next batch. + // if the queue is full, we "flush it"; we sequentially add the points to the buckets in + // g2JacExtended coordinates. + // The reasoning behind this is the following; batchSize is chosen such as, for a uniformly random + // input, the number of conflicts is going to be low, and the element added to the queue should be immediatly + // processed in the next batch. If it's not the case, then our inputs are not random; and we fallback to + // non-batch-affine version. 
+ + // note that we have 2 sets of buckets + // 1 in G2Affine used with the batch affine additions + // 1 in g2JacExtended used in case the queue of conflicting points var buckets B var bucketsJE BJE - var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -392,13 +389,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) - // var queue [batchSize]batchOpG2Affine batchSize := len(P) - isFull := func() bool { - return cptAdd == batchSize - } + isFull := func() bool { return cptAdd == batchSize } executeAndReset := func() { batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) @@ -406,12 +400,16 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG2Affine) { - // @precondition: ensures bucket is not "used" in current batch + // @precondition: must ensures bucket is not "used" in current batch + // note that there is a bit of duplicate logic between add and addFromQueue + // the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature) + // the compiler will put the queue on the heap. BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P - if !bucketSet[op.bucketID] { - bucketSet[op.bucketID] = true + if BK.IsInfinity() { BK.Set(&op.point) return } @@ -424,7 +422,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP return } BK.setInfinity() - bucketSet[op.bucketID] = false return } @@ -438,13 +435,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if !bucketSet[bucketID] { + if BK.IsInfinity() { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } - bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -456,14 +452,11 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() - bucketSet[bucketID] = false } - return } if isAdd { BK.setInfinity() - bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -487,17 +480,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP qID = 0 } - processQueue := func() { + processTopQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { return } addFromQueue(queue[i]) - // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) - // if isFull() { - // executeAndReset() - // } - // queue[i] = queue[qID-1] + // len(queue) < batchSize so no need to check for full batch. 
 			qID--
 		}
 	}
@@ -519,10 +508,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 			// put it in queue
 			queue[qID].bucketID = bucketID
 			if isAdd {
-				// queue[qID].pointID = uint32(i << 1)
 				queue[qID].point.Set(&points[i])
 			} else {
-				// queue[qID].pointID = uint32(i << 1) + 1
 				queue[qID].point.Neg(&points[i])
 			}
 			qID++
@@ -538,7 +525,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		add(bucketID, &points[i], isAdd)
 		if isFull() {
 			executeAndReset()
-			processQueue() // TODO top queue only
+			processTopQueue()
 		}
 	}
@@ -547,21 +534,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 	// empty the queue
 	flushQueue()
-	// for qID != 0 {
-	// 	processQueue()
-	// 	executeAndReset()
-	// }
 	// reduce buckets into total
 	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
-
 	var runningSum, total g2JacExtended
 	runningSum.setInfinity()
 	total.setInfinity()
 	for k := len(buckets) - 1; k >= 0; k-- {
-		if bucketSet[k] {
-			runningSum.addMixed(&buckets[k])
-		}
+		runningSum.addMixed(&buckets[k])
 		if !bucketsJE[k].ZZ.IsZero() {
 			runningSum.add(&bucketsJE[k])
 		}
diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go
index bb1a3ac61e..946645ded0 100644
--- a/ecc/bls12-381/multiexp_test.go
+++ b/ecc/bls12-381/multiexp_test.go
@@ -669,11 +669,7 @@ func fillBenchBasesG2(samplePoints []G2Affine)
 
 func fillBenchScalars(sampleScalars []fr.Element) {
 	// ensure every words of the scalars are filled
-	var mixer fr.Element
-	mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487")
-	for i := 1; i <= len(sampleScalars); i++ {
-		sampleScalars[i-1].SetUint64(uint64(i)).
-			Mul(&sampleScalars[i-1], &mixer).
-			FromMont()
+	for i := 0; i < len(sampleScalars); i++ {
+		sampleScalars[i].SetRandom()
 	}
 }
diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go
index ac3db048e6..6ef411f4b5 100644
--- a/ecc/bls24-315/multiexp_affine.go
+++ b/ecc/bls24-315/multiexp_affine.go
@@ -23,13 +23,7 @@ import (
 
 type batchOpG1Affine struct {
 	bucketID uint16
-	// pointID  uint32
-	point    G1Affine
-}
-
-func (o batchOpG1Affine) isNeg() bool {
-	return false
-	// return o.pointID&1 == 1
+	point    G1Affine
 }
 
 // processChunkG1BatchAffine process a chunk of the scalars during the msm
@@ -45,10 +39,23 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 	points []G1Affine,
 	digits []uint16) {
 
-	// init the buckets
+	// the batch affine addition needs independent points; in other words, for a window of batchSize
+	// we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying
+	// to add 2 different points to the same bucket), then we push the conflicting point to a queue.
+	// each time the batch is full, we execute it, and tentatively put the points (if they no longer conflict)
+	// from the top of the queue into the next batch.
+	// if the queue is full, we "flush" it; we sequentially add the points to the buckets in
+	// g1JacExtended coordinates.
+	// The reasoning behind this is the following: batchSize is chosen such that, for a uniformly random
+	// input, the number of conflicts is going to be low, and an element added to the queue should be immediately
+	// processed in the next batch. If that is not the case, then our inputs are not random, and we fall back to
+	// the non-batch-affine version.
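The bucket reduction in the hunks above computes total = bucket[0] + 2*bucket[1] + 3*bucket[2] + ... + n*bucket[n-1] with two running sums and no scalar multiplications. A sketch of the same recurrence over plain integers (group additions replaced by integer +, purely for intuition):

    package main

    import "fmt"

    func main() {
    	// bucket[k] stands for the sum of all points whose digit is k+1
    	buckets := []int{3, 0, 7, 1}
    	runningSum, total := 0, 0
    	for k := len(buckets) - 1; k >= 0; k-- {
    		runningSum += buckets[k] // bucket[k] + bucket[k+1] + ... + bucket[n-1]
    		total += runningSum
    	}
    	fmt.Println(total) // 1*3 + 2*0 + 3*7 + 4*1 = 28
    }

Scanning from the top bucket down, runningSum holds bucket[k] + ... + bucket[n-1]; since it is added to total once per iteration, bucket[k] contributes exactly (k+1) times, which is the weighted sum above.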
+
+	// note that we have 2 sets of buckets
+	// 1 in G1Affine used with the batch affine additions
+	// 1 in g1JacExtended used in case the queue of conflicting points is full
 	var buckets B
 	var bucketsJE BJE
-	var bucketSet BS
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 		bucketsJE[i].setInfinity()
@@ -63,13 +70,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		queue   TQ  // queue of points that conflict the current batch
 		qID     int // current position in queue
 	)
-	// var queue [batchSize]batchOpG1Affine
 
 	batchSize := len(P)
 
-	isFull := func() bool {
-		return cptAdd == batchSize
-	}
+	isFull := func() bool { return cptAdd == batchSize }
 
 	executeAndReset := func() {
 		batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd)
@@ -77,12 +81,16 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		bucketIds = tmp
 		cptAdd = 0
 	}
+
 	addFromQueue := func(op batchOpG1Affine) {
-		// @precondition: ensures bucket is not "used" in current batch
+		// @precondition: the caller must ensure the bucket is not "used" in the current batch
+		// note that there is a bit of duplicate logic between add and addFromQueue;
+		// the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature),
+		// the compiler will put the queue on the heap.
 		BK := &buckets[op.bucketID]
+
 		// handle special cases with inf or -P / P
-		if !bucketSet[op.bucketID] {
-			bucketSet[op.bucketID] = true
+		if BK.IsInfinity() {
 			BK.Set(&op.point)
 			return
 		}
@@ -95,7 +103,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 			return
 		}
 		BK.setInfinity()
-		bucketSet[op.bucketID] = false
 		return
 	}
@@ -109,13 +116,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		// @precondition: ensures bucket is not "used" in current batch
 		BK := &buckets[bucketID]
 		// handle special cases with inf or -P / P
-		if !bucketSet[bucketID] {
+		if BK.IsInfinity() {
 			if isAdd {
 				BK.Set(PP)
 			} else {
 				BK.Neg(PP)
 			}
-			bucketSet[bucketID] = true
 			return
 		}
 		if BK.X.Equal(&PP.X) {
@@ -127,14 +133,11 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 				BK.Add(BK, BK)
 			} else {
 				BK.setInfinity()
-				bucketSet[bucketID] = false
 			}
-
 			return
 		}
 
 		if isAdd {
 			BK.setInfinity()
-			bucketSet[bucketID] = false
 		} else {
 			BK.Add(BK, BK)
 		}
@@ -158,17 +161,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		qID = 0
 	}
 
-	processQueue := func() {
+	processTopQueue := func() {
 		for i := qID - 1; i >= 0; i-- {
 			if bucketIds[queue[i].bucketID] {
 				return
 			}
 			addFromQueue(queue[i])
-			// add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg())
-			// if isFull() {
-			// 	executeAndReset()
-			// }
-			// queue[i] = queue[qID-1]
+			// len(queue) < batchSize, so there is no need to check for a full batch here.
 			qID--
 		}
 	}
@@ -190,10 +189,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 			// put it in queue
 			queue[qID].bucketID = bucketID
 			if isAdd {
-				// queue[qID].pointID = uint32(i << 1)
 				queue[qID].point.Set(&points[i])
 			} else {
-				// queue[qID].pointID = uint32(i << 1) + 1
 				queue[qID].point.Neg(&points[i])
 			}
 			qID++
@@ -209,7 +206,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		add(bucketID, &points[i], isAdd)
 		if isFull() {
 			executeAndReset()
-			processQueue() // TODO top queue only
+			processTopQueue()
 		}
 	}
@@ -218,21 +215,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 	// empty the queue
 	flushQueue()
-	// for qID != 0 {
-	// 	processQueue()
-	// 	executeAndReset()
-	// }
 	// reduce buckets into total
 	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
-
 	var runningSum, total g1JacExtended
 	runningSum.setInfinity()
 	total.setInfinity()
 	for k := len(buckets) - 1; k >= 0; k-- {
-		if bucketSet[k] {
-			runningSum.addMixed(&buckets[k])
-		}
+		runningSum.addMixed(&buckets[k])
 		if !bucketsJE[k].ZZ.IsZero() {
 			runningSum.add(&bucketsJE[k])
 		}
@@ -352,13 +342,7 @@ type qG1AffineC16 [640]batchOpG1Affine
 
 type batchOpG2Affine struct {
 	bucketID uint16
-	// pointID  uint32
-	point    G2Affine
-}
-
-func (o batchOpG2Affine) isNeg() bool {
-	return false
-	// return o.pointID&1 == 1
+	point    G2Affine
 }
 
 // processChunkG2BatchAffine process a chunk of the scalars during the msm
@@ -374,10 +358,23 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 	points []G2Affine,
 	digits []uint16) {
 
-	// init the buckets
+	// the batch affine addition needs independent points; in other words, for a window of batchSize
+	// we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying
+	// to add 2 different points to the same bucket), then we push the conflicting point to a queue.
+	// each time the batch is full, we execute it, and tentatively put the points (if they no longer conflict)
+	// from the top of the queue into the next batch.
+	// if the queue is full, we "flush" it; we sequentially add the points to the buckets in
+	// g2JacExtended coordinates.
+	// The reasoning behind this is the following: batchSize is chosen such that, for a uniformly random
+	// input, the number of conflicts is going to be low, and an element added to the queue should be immediately
+	// processed in the next batch. If that is not the case, then our inputs are not random, and we fall back to
+	// the non-batch-affine version.
+
+	// note that we have 2 sets of buckets
+	// 1 in G2Affine used with the batch affine additions
+	// 1 in g2JacExtended used in case the queue of conflicting points is full
 	var buckets B
 	var bucketsJE BJE
-	var bucketSet BS
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 		bucketsJE[i].setInfinity()
@@ -392,13 +389,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		queue   TQ  // queue of points that conflict the current batch
 		qID     int // current position in queue
 	)
-	// var queue [batchSize]batchOpG2Affine
 
 	batchSize := len(P)
 
-	isFull := func() bool {
-		return cptAdd == batchSize
-	}
+	isFull := func() bool { return cptAdd == batchSize }
 
 	executeAndReset := func() {
 		batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd)
@@ -406,12 +400,16 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		bucketIds = tmp
 		cptAdd = 0
 	}
+
 	addFromQueue := func(op batchOpG2Affine) {
-		// @precondition: ensures bucket is not "used" in current batch
+		// @precondition: the caller must ensure the bucket is not "used" in the current batch
+		// note that there is a bit of duplicate logic between add and addFromQueue;
+		// the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature),
+		// the compiler will put the queue on the heap.
 		BK := &buckets[op.bucketID]
+
 		// handle special cases with inf or -P / P
-		if !bucketSet[op.bucketID] {
-			bucketSet[op.bucketID] = true
+		if BK.IsInfinity() {
 			BK.Set(&op.point)
 			return
 		}
@@ -424,7 +422,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 			return
 		}
 		BK.setInfinity()
-		bucketSet[op.bucketID] = false
 		return
 	}
@@ -438,13 +435,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		// @precondition: ensures bucket is not "used" in current batch
 		BK := &buckets[bucketID]
 		// handle special cases with inf or -P / P
-		if !bucketSet[bucketID] {
+		if BK.IsInfinity() {
 			if isAdd {
 				BK.Set(PP)
 			} else {
 				BK.Neg(PP)
 			}
-			bucketSet[bucketID] = true
 			return
 		}
 		if BK.X.Equal(&PP.X) {
@@ -456,14 +452,11 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 				BK.Add(BK, BK)
 			} else {
 				BK.setInfinity()
-				bucketSet[bucketID] = false
 			}
-
 			return
 		}
 
 		if isAdd {
 			BK.setInfinity()
-			bucketSet[bucketID] = false
 		} else {
 			BK.Add(BK, BK)
 		}
@@ -487,17 +480,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		qID = 0
 	}
 
-	processQueue := func() {
+	processTopQueue := func() {
 		for i := qID - 1; i >= 0; i-- {
 			if bucketIds[queue[i].bucketID] {
 				return
 			}
 			addFromQueue(queue[i])
-			// add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg())
-			// if isFull() {
-			// 	executeAndReset()
-			// }
-			// queue[i] = queue[qID-1]
+			// len(queue) < batchSize, so there is no need to check for a full batch here.
 			qID--
 		}
 	}
@@ -519,10 +508,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 			// put it in queue
 			queue[qID].bucketID = bucketID
 			if isAdd {
-				// queue[qID].pointID = uint32(i << 1)
 				queue[qID].point.Set(&points[i])
 			} else {
-				// queue[qID].pointID = uint32(i << 1) + 1
 				queue[qID].point.Neg(&points[i])
 			}
 			qID++
@@ -538,7 +525,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		add(bucketID, &points[i], isAdd)
 		if isFull() {
 			executeAndReset()
-			processQueue() // TODO top queue only
+			processTopQueue()
 		}
 	}
@@ -547,21 +534,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 	// empty the queue
 	flushQueue()
-	// for qID != 0 {
-	// 	processQueue()
-	// 	executeAndReset()
-	// }
 	// reduce buckets into total
 	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
-
 	var runningSum, total g2JacExtended
 	runningSum.setInfinity()
 	total.setInfinity()
 	for k := len(buckets) - 1; k >= 0; k-- {
-		if bucketSet[k] {
-			runningSum.addMixed(&buckets[k])
-		}
+		runningSum.addMixed(&buckets[k])
 		if !bucketsJE[k].ZZ.IsZero() {
 			runningSum.add(&bucketsJE[k])
 		}
diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go
index f44d8d7b81..27efcb0f13 100644
--- a/ecc/bls24-315/multiexp_test.go
+++ b/ecc/bls24-315/multiexp_test.go
@@ -669,11 +669,7 @@ func fillBenchBasesG2(samplePoints []G2Affine)
 
 func fillBenchScalars(sampleScalars []fr.Element) {
 	// ensure every words of the scalars are filled
-	var mixer fr.Element
-	mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487")
-	for i := 1; i <= len(sampleScalars); i++ {
-		sampleScalars[i-1].SetUint64(uint64(i)).
-			Mul(&sampleScalars[i-1], &mixer).
-			FromMont()
+	for i := 0; i < len(sampleScalars); i++ {
+		sampleScalars[i].SetRandom()
 	}
 }
diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go
index 4e241f79a5..a442f743ec 100644
--- a/ecc/bls24-317/multiexp_affine.go
+++ b/ecc/bls24-317/multiexp_affine.go
@@ -23,13 +23,7 @@ import (
 
 type batchOpG1Affine struct {
 	bucketID uint16
-	// pointID  uint32
-	point    G1Affine
-}
-
-func (o batchOpG1Affine) isNeg() bool {
-	return false
-	// return o.pointID&1 == 1
+	point    G1Affine
 }
 
 // processChunkG1BatchAffine process a chunk of the scalars during the msm
@@ -45,10 +39,23 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 	points []G1Affine,
 	digits []uint16) {
 
-	// init the buckets
+	// the batch affine addition needs independent points; in other words, for a window of batchSize
+	// we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying
+	// to add 2 different points to the same bucket), then we push the conflicting point to a queue.
+	// each time the batch is full, we execute it, and tentatively put the points (if they no longer conflict)
+	// from the top of the queue into the next batch.
+	// if the queue is full, we "flush" it; we sequentially add the points to the buckets in
+	// g1JacExtended coordinates.
+	// The reasoning behind this is the following: batchSize is chosen such that, for a uniformly random
+	// input, the number of conflicts is going to be low, and an element added to the queue should be immediately
+	// processed in the next batch. If that is not the case, then our inputs are not random, and we fall back to
+	// the non-batch-affine version.
+
+	// note that we have 2 sets of buckets
+	// 1 in G1Affine used with the batch affine additions
+	// 1 in g1JacExtended used in case the queue of conflicting points is full
 	var buckets B
 	var bucketsJE BJE
-	var bucketSet BS
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 		bucketsJE[i].setInfinity()
@@ -63,13 +70,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		queue   TQ  // queue of points that conflict the current batch
 		qID     int // current position in queue
 	)
-	// var queue [batchSize]batchOpG1Affine
 
 	batchSize := len(P)
 
-	isFull := func() bool {
-		return cptAdd == batchSize
-	}
+	isFull := func() bool { return cptAdd == batchSize }
 
 	executeAndReset := func() {
 		batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd)
@@ -77,12 +81,16 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		bucketIds = tmp
 		cptAdd = 0
 	}
+
 	addFromQueue := func(op batchOpG1Affine) {
-		// @precondition: ensures bucket is not "used" in current batch
+		// @precondition: the caller must ensure the bucket is not "used" in the current batch
+		// note that there is a bit of duplicate logic between add and addFromQueue;
+		// the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature),
+		// the compiler will put the queue on the heap.
 		BK := &buckets[op.bucketID]
+
 		// handle special cases with inf or -P / P
-		if !bucketSet[op.bucketID] {
-			bucketSet[op.bucketID] = true
+		if BK.IsInfinity() {
 			BK.Set(&op.point)
 			return
 		}
@@ -95,7 +103,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 			return
 		}
 		BK.setInfinity()
-		bucketSet[op.bucketID] = false
 		return
 	}
@@ -109,13 +116,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		// @precondition: ensures bucket is not "used" in current batch
 		BK := &buckets[bucketID]
 		// handle special cases with inf or -P / P
-		if !bucketSet[bucketID] {
+		if BK.IsInfinity() {
 			if isAdd {
 				BK.Set(PP)
 			} else {
 				BK.Neg(PP)
 			}
-			bucketSet[bucketID] = true
 			return
 		}
 		if BK.X.Equal(&PP.X) {
@@ -127,14 +133,11 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 				BK.Add(BK, BK)
 			} else {
 				BK.setInfinity()
-				bucketSet[bucketID] = false
 			}
-
 			return
 		}
 
 		if isAdd {
 			BK.setInfinity()
-			bucketSet[bucketID] = false
 		} else {
 			BK.Add(BK, BK)
 		}
@@ -158,17 +161,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		qID = 0
 	}
 
-	processQueue := func() {
+	processTopQueue := func() {
 		for i := qID - 1; i >= 0; i-- {
 			if bucketIds[queue[i].bucketID] {
 				return
 			}
 			addFromQueue(queue[i])
-			// add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg())
-			// if isFull() {
-			// 	executeAndReset()
-			// }
-			// queue[i] = queue[qID-1]
+			// len(queue) < batchSize, so there is no need to check for a full batch here.
 			qID--
 		}
 	}
@@ -190,10 +189,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 			// put it in queue
 			queue[qID].bucketID = bucketID
 			if isAdd {
-				// queue[qID].pointID = uint32(i << 1)
 				queue[qID].point.Set(&points[i])
 			} else {
-				// queue[qID].pointID = uint32(i << 1) + 1
 				queue[qID].point.Neg(&points[i])
 			}
 			qID++
@@ -209,7 +206,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		add(bucketID, &points[i], isAdd)
 		if isFull() {
 			executeAndReset()
-			processQueue() // TODO top queue only
+			processTopQueue()
 		}
 	}
@@ -218,21 +215,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 	// empty the queue
 	flushQueue()
-	// for qID != 0 {
-	// 	processQueue()
-	// 	executeAndReset()
-	// }
 	// reduce buckets into total
 	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
-
 	var runningSum, total g1JacExtended
 	runningSum.setInfinity()
 	total.setInfinity()
 	for k := len(buckets) - 1; k >= 0; k-- {
-		if bucketSet[k] {
-			runningSum.addMixed(&buckets[k])
-		}
+		runningSum.addMixed(&buckets[k])
 		if !bucketsJE[k].ZZ.IsZero() {
 			runningSum.add(&bucketsJE[k])
 		}
@@ -352,13 +342,7 @@ type qG1AffineC16 [640]batchOpG1Affine
 
 type batchOpG2Affine struct {
 	bucketID uint16
-	// pointID  uint32
-	point    G2Affine
-}
-
-func (o batchOpG2Affine) isNeg() bool {
-	return false
-	// return o.pointID&1 == 1
+	point    G2Affine
 }
 
 // processChunkG2BatchAffine process a chunk of the scalars during the msm
@@ -374,10 +358,23 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 	points []G2Affine,
 	digits []uint16) {
 
-	// init the buckets
+	// the batch affine addition needs independent points; in other words, for a window of batchSize
+	// we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying
+	// to add 2 different points to the same bucket), then we push the conflicting point to a queue.
+	// each time the batch is full, we execute it, and tentatively put the points (if they no longer conflict)
+	// from the top of the queue into the next batch.
+	// if the queue is full, we "flush" it; we sequentially add the points to the buckets in
+	// g2JacExtended coordinates.
+	// The reasoning behind this is the following: batchSize is chosen such that, for a uniformly random
+	// input, the number of conflicts is going to be low, and an element added to the queue should be immediately
+	// processed in the next batch. If that is not the case, then our inputs are not random, and we fall back to
+	// the non-batch-affine version.
+
+	// note that we have 2 sets of buckets
+	// 1 in G2Affine used with the batch affine additions
+	// 1 in g2JacExtended used in case the queue of conflicting points is full
 	var buckets B
 	var bucketsJE BJE
-	var bucketSet BS
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 		bucketsJE[i].setInfinity()
@@ -392,13 +389,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		queue   TQ  // queue of points that conflict the current batch
 		qID     int // current position in queue
 	)
-	// var queue [batchSize]batchOpG2Affine
 
 	batchSize := len(P)
 
-	isFull := func() bool {
-		return cptAdd == batchSize
-	}
+	isFull := func() bool { return cptAdd == batchSize }
 
 	executeAndReset := func() {
 		batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd)
@@ -406,12 +400,16 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		bucketIds = tmp
 		cptAdd = 0
 	}
+
 	addFromQueue := func(op batchOpG2Affine) {
-		// @precondition: ensures bucket is not "used" in current batch
+		// @precondition: the caller must ensure the bucket is not "used" in the current batch
+		// note that there is a bit of duplicate logic between add and addFromQueue;
+		// the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature),
+		// the compiler will put the queue on the heap.
 		BK := &buckets[op.bucketID]
+
 		// handle special cases with inf or -P / P
-		if !bucketSet[op.bucketID] {
-			bucketSet[op.bucketID] = true
+		if BK.IsInfinity() {
 			BK.Set(&op.point)
 			return
 		}
@@ -424,7 +422,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 			return
 		}
 		BK.setInfinity()
-		bucketSet[op.bucketID] = false
 		return
 	}
@@ -438,13 +435,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		// @precondition: ensures bucket is not "used" in current batch
 		BK := &buckets[bucketID]
 		// handle special cases with inf or -P / P
-		if !bucketSet[bucketID] {
+		if BK.IsInfinity() {
 			if isAdd {
 				BK.Set(PP)
 			} else {
 				BK.Neg(PP)
 			}
-			bucketSet[bucketID] = true
 			return
 		}
 		if BK.X.Equal(&PP.X) {
@@ -456,14 +452,11 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 				BK.Add(BK, BK)
 			} else {
 				BK.setInfinity()
-				bucketSet[bucketID] = false
 			}
-
 			return
 		}
 
 		if isAdd {
 			BK.setInfinity()
-			bucketSet[bucketID] = false
 		} else {
 			BK.Add(BK, BK)
 		}
@@ -487,17 +480,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		qID = 0
 	}
 
-	processQueue := func() {
+	processTopQueue := func() {
 		for i := qID - 1; i >= 0; i-- {
 			if bucketIds[queue[i].bucketID] {
 				return
 			}
 			addFromQueue(queue[i])
-			// add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg())
-			// if isFull() {
-			// 	executeAndReset()
-			// }
-			// queue[i] = queue[qID-1]
+			// len(queue) < batchSize, so there is no need to check for a full batch here.
 			qID--
 		}
 	}
@@ -519,10 +508,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 			// put it in queue
 			queue[qID].bucketID = bucketID
 			if isAdd {
-				// queue[qID].pointID = uint32(i << 1)
 				queue[qID].point.Set(&points[i])
 			} else {
-				// queue[qID].pointID = uint32(i << 1) + 1
 				queue[qID].point.Neg(&points[i])
 			}
 			qID++
@@ -538,7 +525,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		add(bucketID, &points[i], isAdd)
 		if isFull() {
 			executeAndReset()
-			processQueue() // TODO top queue only
+			processTopQueue()
 		}
 	}
@@ -547,21 +534,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 	// empty the queue
 	flushQueue()
-	// for qID != 0 {
-	// 	processQueue()
-	// 	executeAndReset()
-	// }
 	// reduce buckets into total
 	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
-
 	var runningSum, total g2JacExtended
 	runningSum.setInfinity()
 	total.setInfinity()
 	for k := len(buckets) - 1; k >= 0; k-- {
-		if bucketSet[k] {
-			runningSum.addMixed(&buckets[k])
-		}
+		runningSum.addMixed(&buckets[k])
 		if !bucketsJE[k].ZZ.IsZero() {
 			runningSum.add(&bucketsJE[k])
 		}
diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go
index b3faa9e76f..95650ab5ca 100644
--- a/ecc/bls24-317/multiexp_test.go
+++ b/ecc/bls24-317/multiexp_test.go
@@ -669,11 +669,7 @@ func fillBenchBasesG2(samplePoints []G2Affine)
 
 func fillBenchScalars(sampleScalars []fr.Element) {
 	// ensure every words of the scalars are filled
-	var mixer fr.Element
-	mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487")
-	for i := 1; i <= len(sampleScalars); i++ {
-		sampleScalars[i-1].SetUint64(uint64(i)).
-			Mul(&sampleScalars[i-1], &mixer).
-			FromMont()
+	for i := 0; i < len(sampleScalars); i++ {
+		sampleScalars[i].SetRandom()
 	}
 }
diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go
index a268d19c87..447d11c42c 100644
--- a/ecc/bn254/multiexp_affine.go
+++ b/ecc/bn254/multiexp_affine.go
@@ -23,13 +23,7 @@ import (
 
 type batchOpG1Affine struct {
 	bucketID uint16
-	// pointID  uint32
-	point    G1Affine
-}
-
-func (o batchOpG1Affine) isNeg() bool {
-	return false
-	// return o.pointID&1 == 1
+	point    G1Affine
 }
 
 // processChunkG1BatchAffine process a chunk of the scalars during the msm
@@ -45,10 +39,23 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 	points []G1Affine,
 	digits []uint16) {
 
-	// init the buckets
+	// the batch affine addition needs independent points; in other words, for a window of batchSize
+	// we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying
+	// to add 2 different points to the same bucket), then we push the conflicting point to a queue.
+	// each time the batch is full, we execute it, and tentatively put the points (if they no longer conflict)
+	// from the top of the queue into the next batch.
+	// if the queue is full, we "flush" it; we sequentially add the points to the buckets in
+	// g1JacExtended coordinates.
+	// The reasoning behind this is the following: batchSize is chosen such that, for a uniformly random
+	// input, the number of conflicts is going to be low, and an element added to the queue should be immediately
+	// processed in the next batch. If that is not the case, then our inputs are not random, and we fall back to
+	// the non-batch-affine version.
+
+	// note that we have 2 sets of buckets
+	// 1 in G1Affine used with the batch affine additions
+	// 1 in g1JacExtended used in case the queue of conflicting points is full
 	var buckets B
 	var bucketsJE BJE
-	var bucketSet BS
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 		bucketsJE[i].setInfinity()
@@ -63,13 +70,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		queue   TQ  // queue of points that conflict the current batch
 		qID     int // current position in queue
 	)
-	// var queue [batchSize]batchOpG1Affine
 
 	batchSize := len(P)
 
-	isFull := func() bool {
-		return cptAdd == batchSize
-	}
+	isFull := func() bool { return cptAdd == batchSize }
 
 	executeAndReset := func() {
 		batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd)
@@ -77,12 +81,16 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		bucketIds = tmp
 		cptAdd = 0
 	}
+
 	addFromQueue := func(op batchOpG1Affine) {
-		// @precondition: ensures bucket is not "used" in current batch
+		// @precondition: the caller must ensure the bucket is not "used" in the current batch
+		// note that there is a bit of duplicate logic between add and addFromQueue;
+		// the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature),
+		// the compiler will put the queue on the heap.
 		BK := &buckets[op.bucketID]
+
 		// handle special cases with inf or -P / P
-		if !bucketSet[op.bucketID] {
-			bucketSet[op.bucketID] = true
+		if BK.IsInfinity() {
 			BK.Set(&op.point)
 			return
 		}
@@ -95,7 +103,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 			return
 		}
 		BK.setInfinity()
-		bucketSet[op.bucketID] = false
 		return
 	}
@@ -109,13 +116,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		// @precondition: ensures bucket is not "used" in current batch
 		BK := &buckets[bucketID]
 		// handle special cases with inf or -P / P
-		if !bucketSet[bucketID] {
+		if BK.IsInfinity() {
 			if isAdd {
 				BK.Set(PP)
 			} else {
 				BK.Neg(PP)
 			}
-			bucketSet[bucketID] = true
 			return
 		}
 		if BK.X.Equal(&PP.X) {
@@ -127,14 +133,11 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 				BK.Add(BK, BK)
 			} else {
 				BK.setInfinity()
-				bucketSet[bucketID] = false
 			}
-
 			return
 		}
 
 		if isAdd {
 			BK.setInfinity()
-			bucketSet[bucketID] = false
 		} else {
 			BK.Add(BK, BK)
 		}
@@ -158,17 +161,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		qID = 0
 	}
 
-	processQueue := func() {
+	processTopQueue := func() {
 		for i := qID - 1; i >= 0; i-- {
 			if bucketIds[queue[i].bucketID] {
 				return
 			}
 			addFromQueue(queue[i])
-			// add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg())
-			// if isFull() {
-			// 	executeAndReset()
-			// }
-			// queue[i] = queue[qID-1]
+			// len(queue) < batchSize, so there is no need to check for a full batch here.
 			qID--
 		}
 	}
@@ -190,10 +189,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 			// put it in queue
 			queue[qID].bucketID = bucketID
 			if isAdd {
-				// queue[qID].pointID = uint32(i << 1)
 				queue[qID].point.Set(&points[i])
 			} else {
-				// queue[qID].pointID = uint32(i << 1) + 1
 				queue[qID].point.Neg(&points[i])
 			}
 			qID++
@@ -209,7 +206,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		add(bucketID, &points[i], isAdd)
 		if isFull() {
 			executeAndReset()
-			processQueue() // TODO top queue only
+			processTopQueue()
 		}
 	}
@@ -218,21 +215,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 	// empty the queue
 	flushQueue()
-	// for qID != 0 {
-	// 	processQueue()
-	// 	executeAndReset()
-	// }
 	// reduce buckets into total
 	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
-
 	var runningSum, total g1JacExtended
 	runningSum.setInfinity()
 	total.setInfinity()
 	for k := len(buckets) - 1; k >= 0; k-- {
-		if bucketSet[k] {
-			runningSum.addMixed(&buckets[k])
-		}
+		runningSum.addMixed(&buckets[k])
 		if !bucketsJE[k].ZZ.IsZero() {
 			runningSum.add(&bucketsJE[k])
 		}
@@ -352,13 +342,7 @@ type qG1AffineC16 [640]batchOpG1Affine
 
 type batchOpG2Affine struct {
 	bucketID uint16
-	// pointID  uint32
-	point    G2Affine
-}
-
-func (o batchOpG2Affine) isNeg() bool {
-	return false
-	// return o.pointID&1 == 1
+	point    G2Affine
 }
 
 // processChunkG2BatchAffine process a chunk of the scalars during the msm
@@ -374,10 +358,23 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 	points []G2Affine,
 	digits []uint16) {
 
-	// init the buckets
+	// the batch affine addition needs independent points; in other words, for a window of batchSize
+	// we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying
+	// to add 2 different points to the same bucket), then we push the conflicting point to a queue.
+	// each time the batch is full, we execute it, and tentatively put the points (if they no longer conflict)
+	// from the top of the queue into the next batch.
+	// if the queue is full, we "flush" it; we sequentially add the points to the buckets in
+	// g2JacExtended coordinates.
+	// The reasoning behind this is the following: batchSize is chosen such that, for a uniformly random
+	// input, the number of conflicts is going to be low, and an element added to the queue should be immediately
+	// processed in the next batch. If that is not the case, then our inputs are not random, and we fall back to
+	// the non-batch-affine version.
+
+	// note that we have 2 sets of buckets
+	// 1 in G2Affine used with the batch affine additions
+	// 1 in g2JacExtended used in case the queue of conflicting points is full
 	var buckets B
 	var bucketsJE BJE
-	var bucketSet BS
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 		bucketsJE[i].setInfinity()
@@ -392,13 +389,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		queue   TQ  // queue of points that conflict the current batch
 		qID     int // current position in queue
 	)
-	// var queue [batchSize]batchOpG2Affine
 
 	batchSize := len(P)
 
-	isFull := func() bool {
-		return cptAdd == batchSize
-	}
+	isFull := func() bool { return cptAdd == batchSize }
 
 	executeAndReset := func() {
 		batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd)
@@ -406,12 +400,16 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		bucketIds = tmp
 		cptAdd = 0
 	}
+
 	addFromQueue := func(op batchOpG2Affine) {
-		// @precondition: ensures bucket is not "used" in current batch
+		// @precondition: the caller must ensure the bucket is not "used" in the current batch
+		// note that there is a bit of duplicate logic between add and addFromQueue;
+		// the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature),
+		// the compiler will put the queue on the heap.
 		BK := &buckets[op.bucketID]
+
 		// handle special cases with inf or -P / P
-		if !bucketSet[op.bucketID] {
-			bucketSet[op.bucketID] = true
+		if BK.IsInfinity() {
 			BK.Set(&op.point)
 			return
 		}
@@ -424,7 +422,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 			return
 		}
 		BK.setInfinity()
-		bucketSet[op.bucketID] = false
 		return
 	}
@@ -438,13 +435,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		// @precondition: ensures bucket is not "used" in current batch
 		BK := &buckets[bucketID]
 		// handle special cases with inf or -P / P
-		if !bucketSet[bucketID] {
+		if BK.IsInfinity() {
 			if isAdd {
 				BK.Set(PP)
 			} else {
 				BK.Neg(PP)
 			}
-			bucketSet[bucketID] = true
 			return
 		}
 		if BK.X.Equal(&PP.X) {
@@ -456,14 +452,11 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 				BK.Add(BK, BK)
 			} else {
 				BK.setInfinity()
-				bucketSet[bucketID] = false
 			}
-
 			return
 		}
 
 		if isAdd {
 			BK.setInfinity()
-			bucketSet[bucketID] = false
 		} else {
 			BK.Add(BK, BK)
 		}
@@ -487,17 +480,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		qID = 0
 	}
 
-	processQueue := func() {
+	processTopQueue := func() {
 		for i := qID - 1; i >= 0; i-- {
 			if bucketIds[queue[i].bucketID] {
 				return
 			}
 			addFromQueue(queue[i])
-			// add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg())
-			// if isFull() {
-			// 	executeAndReset()
-			// }
-			// queue[i] = queue[qID-1]
+			// len(queue) < batchSize, so there is no need to check for a full batch here.
 			qID--
 		}
 	}
@@ -519,10 +508,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 			// put it in queue
 			queue[qID].bucketID = bucketID
 			if isAdd {
-				// queue[qID].pointID = uint32(i << 1)
 				queue[qID].point.Set(&points[i])
 			} else {
-				// queue[qID].pointID = uint32(i << 1) + 1
 				queue[qID].point.Neg(&points[i])
 			}
 			qID++
@@ -538,7 +525,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		add(bucketID, &points[i], isAdd)
 		if isFull() {
 			executeAndReset()
-			processQueue() // TODO top queue only
+			processTopQueue()
 		}
 	}
@@ -547,21 +534,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 	// empty the queue
 	flushQueue()
-	// for qID != 0 {
-	// 	processQueue()
-	// 	executeAndReset()
-	// }
 	// reduce buckets into total
 	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
-
 	var runningSum, total g2JacExtended
 	runningSum.setInfinity()
 	total.setInfinity()
 	for k := len(buckets) - 1; k >= 0; k-- {
-		if bucketSet[k] {
-			runningSum.addMixed(&buckets[k])
-		}
+		runningSum.addMixed(&buckets[k])
 		if !bucketsJE[k].ZZ.IsZero() {
 			runningSum.add(&bucketsJE[k])
 		}
diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go
index c299f039b8..68e7b17e41 100644
--- a/ecc/bn254/multiexp_test.go
+++ b/ecc/bn254/multiexp_test.go
@@ -669,11 +669,7 @@ func fillBenchBasesG2(samplePoints []G2Affine)
 
 func fillBenchScalars(sampleScalars []fr.Element) {
 	// ensure every words of the scalars are filled
-	var mixer fr.Element
-	mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487")
-	for i := 1; i <= len(sampleScalars); i++ {
-		sampleScalars[i-1].SetUint64(uint64(i)).
-			Mul(&sampleScalars[i-1], &mixer).
-			FromMont()
+	for i := 0; i < len(sampleScalars); i++ {
+		sampleScalars[i].SetRandom()
 	}
 }
diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go
index a10b6cbf76..493d750496 100644
--- a/ecc/bw6-633/multiexp_affine.go
+++ b/ecc/bw6-633/multiexp_affine.go
@@ -22,13 +22,7 @@ import (
 
 type batchOpG1Affine struct {
 	bucketID uint16
-	// pointID  uint32
-	point    G1Affine
-}
-
-func (o batchOpG1Affine) isNeg() bool {
-	return false
-	// return o.pointID&1 == 1
+	point    G1Affine
 }
 
 // processChunkG1BatchAffine process a chunk of the scalars during the msm
@@ -44,10 +38,23 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 	points []G1Affine,
 	digits []uint16) {
 
-	// init the buckets
+	// the batch affine addition needs independent points; in other words, for a window of batchSize
+	// we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying
+	// to add 2 different points to the same bucket), then we push the conflicting point to a queue.
+	// each time the batch is full, we execute it, and tentatively put the points (if they no longer conflict)
+	// from the top of the queue into the next batch.
+	// if the queue is full, we "flush" it; we sequentially add the points to the buckets in
+	// g1JacExtended coordinates.
+	// The reasoning behind this is the following: batchSize is chosen such that, for a uniformly random
+	// input, the number of conflicts is going to be low, and an element added to the queue should be immediately
+	// processed in the next batch. If that is not the case, then our inputs are not random, and we fall back to
+	// the non-batch-affine version.
+
+	// note that we have 2 sets of buckets
+	// 1 in G1Affine used with the batch affine additions
+	// 1 in g1JacExtended used in case the queue of conflicting points is full
 	var buckets B
 	var bucketsJE BJE
-	var bucketSet BS
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 		bucketsJE[i].setInfinity()
@@ -62,13 +69,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		queue   TQ  // queue of points that conflict the current batch
 		qID     int // current position in queue
 	)
-	// var queue [batchSize]batchOpG1Affine
 
 	batchSize := len(P)
 
-	isFull := func() bool {
-		return cptAdd == batchSize
-	}
+	isFull := func() bool { return cptAdd == batchSize }
 
 	executeAndReset := func() {
 		batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd)
@@ -76,12 +80,16 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		bucketIds = tmp
 		cptAdd = 0
 	}
+
 	addFromQueue := func(op batchOpG1Affine) {
-		// @precondition: ensures bucket is not "used" in current batch
+		// @precondition: the caller must ensure the bucket is not "used" in the current batch
+		// note that there is a bit of duplicate logic between add and addFromQueue;
+		// the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature),
+		// the compiler will put the queue on the heap.
 		BK := &buckets[op.bucketID]
+
 		// handle special cases with inf or -P / P
-		if !bucketSet[op.bucketID] {
-			bucketSet[op.bucketID] = true
+		if BK.IsInfinity() {
 			BK.Set(&op.point)
 			return
 		}
@@ -94,7 +102,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 			return
 		}
 		BK.setInfinity()
-		bucketSet[op.bucketID] = false
 		return
 	}
@@ -108,13 +115,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		// @precondition: ensures bucket is not "used" in current batch
 		BK := &buckets[bucketID]
 		// handle special cases with inf or -P / P
-		if !bucketSet[bucketID] {
+		if BK.IsInfinity() {
 			if isAdd {
 				BK.Set(PP)
 			} else {
 				BK.Neg(PP)
 			}
-			bucketSet[bucketID] = true
 			return
 		}
 		if BK.X.Equal(&PP.X) {
@@ -126,14 +132,11 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 				BK.Add(BK, BK)
 			} else {
 				BK.setInfinity()
-				bucketSet[bucketID] = false
 			}
-
 			return
 		}
 
 		if isAdd {
 			BK.setInfinity()
-			bucketSet[bucketID] = false
 		} else {
 			BK.Add(BK, BK)
 		}
@@ -157,17 +160,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		qID = 0
 	}
 
-	processQueue := func() {
+	processTopQueue := func() {
 		for i := qID - 1; i >= 0; i-- {
 			if bucketIds[queue[i].bucketID] {
 				return
 			}
 			addFromQueue(queue[i])
-			// add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg())
-			// if isFull() {
-			// 	executeAndReset()
-			// }
-			// queue[i] = queue[qID-1]
+			// len(queue) < batchSize, so there is no need to check for a full batch here.
 			qID--
 		}
 	}
@@ -189,10 +188,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 			// put it in queue
 			queue[qID].bucketID = bucketID
 			if isAdd {
-				// queue[qID].pointID = uint32(i << 1)
 				queue[qID].point.Set(&points[i])
 			} else {
-				// queue[qID].pointID = uint32(i << 1) + 1
 				queue[qID].point.Neg(&points[i])
 			}
 			qID++
@@ -208,7 +205,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		add(bucketID, &points[i], isAdd)
 		if isFull() {
 			executeAndReset()
-			processQueue() // TODO top queue only
+			processTopQueue()
 		}
 	}
@@ -217,21 +214,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 	// empty the queue
 	flushQueue()
-	// for qID != 0 {
-	// 	processQueue()
-	// 	executeAndReset()
-	// }
 	// reduce buckets into total
 	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
-
 	var runningSum, total g1JacExtended
 	runningSum.setInfinity()
 	total.setInfinity()
 	for k := len(buckets) - 1; k >= 0; k-- {
-		if bucketSet[k] {
-			runningSum.addMixed(&buckets[k])
-		}
+		runningSum.addMixed(&buckets[k])
 		if !bucketsJE[k].ZZ.IsZero() {
 			runningSum.add(&bucketsJE[k])
 		}
@@ -279,13 +269,7 @@ type qG1AffineC16 [640]batchOpG1Affine
 
 type batchOpG2Affine struct {
 	bucketID uint16
-	// pointID  uint32
-	point    G2Affine
-}
-
-func (o batchOpG2Affine) isNeg() bool {
-	return false
-	// return o.pointID&1 == 1
+	point    G2Affine
 }
 
 // processChunkG2BatchAffine process a chunk of the scalars during the msm
@@ -301,10 +285,23 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 	points []G2Affine,
 	digits []uint16) {
 
-	// init the buckets
+	// the batch affine addition needs independent points; in other words, for a window of batchSize
+	// we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying
+	// to add 2 different points to the same bucket), then we push the conflicting point to a queue.
+	// each time the batch is full, we execute it, and tentatively put the points (if they no longer conflict)
+	// from the top of the queue into the next batch.
+	// if the queue is full, we "flush" it; we sequentially add the points to the buckets in
+	// g2JacExtended coordinates.
+	// The reasoning behind this is the following: batchSize is chosen such that, for a uniformly random
+	// input, the number of conflicts is going to be low, and an element added to the queue should be immediately
+	// processed in the next batch. If that is not the case, then our inputs are not random, and we fall back to
+	// the non-batch-affine version.
+
+	// note that we have 2 sets of buckets
+	// 1 in G2Affine used with the batch affine additions
+	// 1 in g2JacExtended used in case the queue of conflicting points is full
 	var buckets B
 	var bucketsJE BJE
-	var bucketSet BS
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 		bucketsJE[i].setInfinity()
@@ -319,13 +316,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		queue   TQ  // queue of points that conflict the current batch
 		qID     int // current position in queue
 	)
-	// var queue [batchSize]batchOpG2Affine
 
 	batchSize := len(P)
 
-	isFull := func() bool {
-		return cptAdd == batchSize
-	}
+	isFull := func() bool { return cptAdd == batchSize }
 
 	executeAndReset := func() {
 		batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd)
@@ -333,12 +327,16 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		bucketIds = tmp
 		cptAdd = 0
 	}
+
 	addFromQueue := func(op batchOpG2Affine) {
-		// @precondition: ensures bucket is not "used" in current batch
+		// @precondition: the caller must ensure the bucket is not "used" in the current batch
+		// note that there is a bit of duplicate logic between add and addFromQueue;
+		// the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature),
+		// the compiler will put the queue on the heap.
 		BK := &buckets[op.bucketID]
+
 		// handle special cases with inf or -P / P
-		if !bucketSet[op.bucketID] {
-			bucketSet[op.bucketID] = true
+		if BK.IsInfinity() {
 			BK.Set(&op.point)
 			return
 		}
@@ -351,7 +349,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 			return
 		}
 		BK.setInfinity()
-		bucketSet[op.bucketID] = false
 		return
 	}
@@ -365,13 +362,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		// @precondition: ensures bucket is not "used" in current batch
 		BK := &buckets[bucketID]
 		// handle special cases with inf or -P / P
-		if !bucketSet[bucketID] {
+		if BK.IsInfinity() {
 			if isAdd {
 				BK.Set(PP)
 			} else {
 				BK.Neg(PP)
 			}
-			bucketSet[bucketID] = true
 			return
 		}
 		if BK.X.Equal(&PP.X) {
@@ -383,14 +379,11 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 				BK.Add(BK, BK)
 			} else {
 				BK.setInfinity()
-				bucketSet[bucketID] = false
 			}
-
 			return
 		}
 
 		if isAdd {
 			BK.setInfinity()
-			bucketSet[bucketID] = false
 		} else {
 			BK.Add(BK, BK)
 		}
@@ -414,17 +407,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		qID = 0
 	}
 
-	processQueue := func() {
+	processTopQueue := func() {
 		for i := qID - 1; i >= 0; i-- {
 			if bucketIds[queue[i].bucketID] {
 				return
 			}
 			addFromQueue(queue[i])
-			// add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg())
-			// if isFull() {
-			// 	executeAndReset()
-			// }
-			// queue[i] = queue[qID-1]
+			// len(queue) < batchSize, so there is no need to check for a full batch here.
 			qID--
 		}
 	}
@@ -446,10 +435,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 			// put it in queue
 			queue[qID].bucketID = bucketID
 			if isAdd {
-				// queue[qID].pointID = uint32(i << 1)
 				queue[qID].point.Set(&points[i])
 			} else {
-				// queue[qID].pointID = uint32(i << 1) + 1
 				queue[qID].point.Neg(&points[i])
 			}
 			qID++
@@ -465,7 +452,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		add(bucketID, &points[i], isAdd)
 		if isFull() {
 			executeAndReset()
-			processQueue() // TODO top queue only
+			processTopQueue()
 		}
 	}
@@ -474,21 +461,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 	// empty the queue
 	flushQueue()
-	// for qID != 0 {
-	// 	processQueue()
-	// 	executeAndReset()
-	// }
 	// reduce buckets into total
 	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
-
 	var runningSum, total g2JacExtended
 	runningSum.setInfinity()
 	total.setInfinity()
 	for k := len(buckets) - 1; k >= 0; k-- {
-		if bucketSet[k] {
-			runningSum.addMixed(&buckets[k])
-		}
+		runningSum.addMixed(&buckets[k])
 		if !bucketsJE[k].ZZ.IsZero() {
 			runningSum.add(&bucketsJE[k])
 		}
diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go
index bee27d7123..b059cbc98f 100644
--- a/ecc/bw6-633/multiexp_test.go
+++ b/ecc/bw6-633/multiexp_test.go
@@ -669,11 +669,7 @@ func fillBenchBasesG2(samplePoints []G2Affine)
 
 func fillBenchScalars(sampleScalars []fr.Element) {
 	// ensure every words of the scalars are filled
-	var mixer fr.Element
-	mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487")
-	for i := 1; i <= len(sampleScalars); i++ {
-		sampleScalars[i-1].SetUint64(uint64(i)).
-			Mul(&sampleScalars[i-1], &mixer).
-			FromMont()
+	for i := 0; i < len(sampleScalars); i++ {
+		sampleScalars[i].SetRandom()
 	}
 }
diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go
index 00e9053fd1..003b4678bd 100644
--- a/ecc/bw6-756/multiexp_affine.go
+++ b/ecc/bw6-756/multiexp_affine.go
@@ -22,13 +22,7 @@ import (
 
 type batchOpG1Affine struct {
 	bucketID uint16
-	// pointID  uint32
-	point    G1Affine
-}
-
-func (o batchOpG1Affine) isNeg() bool {
-	return false
-	// return o.pointID&1 == 1
+	point    G1Affine
 }
 
 // processChunkG1BatchAffine process a chunk of the scalars during the msm
@@ -44,10 +38,23 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 	points []G1Affine,
 	digits []uint16) {
 
-	// init the buckets
+	// the batch affine addition needs independent points; in other words, for a window of batchSize
+	// we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying
+	// to add 2 different points to the same bucket), then we push the conflicting point to a queue.
+	// each time the batch is full, we execute it, and tentatively put the points (if they no longer conflict)
+	// from the top of the queue into the next batch.
+	// if the queue is full, we "flush" it; we sequentially add the points to the buckets in
+	// g1JacExtended coordinates.
+	// The reasoning behind this is the following: batchSize is chosen such that, for a uniformly random
+	// input, the number of conflicts is going to be low, and an element added to the queue should be immediately
+	// processed in the next batch. If that is not the case, then our inputs are not random, and we fall back to
+	// the non-batch-affine version.
+
+	// note that we have 2 sets of buckets
+	// 1 in G1Affine used with the batch affine additions
+	// 1 in g1JacExtended used in case the queue of conflicting points is full
 	var buckets B
 	var bucketsJE BJE
-	var bucketSet BS
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 		bucketsJE[i].setInfinity()
@@ -62,13 +69,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		queue   TQ  // queue of points that conflict the current batch
 		qID     int // current position in queue
 	)
-	// var queue [batchSize]batchOpG1Affine
 
 	batchSize := len(P)
 
-	isFull := func() bool {
-		return cptAdd == batchSize
-	}
+	isFull := func() bool { return cptAdd == batchSize }
 
 	executeAndReset := func() {
 		batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd)
@@ -76,12 +80,16 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		bucketIds = tmp
 		cptAdd = 0
 	}
+
 	addFromQueue := func(op batchOpG1Affine) {
-		// @precondition: ensures bucket is not "used" in current batch
+		// @precondition: the caller must ensure the bucket is not "used" in the current batch
+		// note that there is a bit of duplicate logic between add and addFromQueue;
+		// the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature),
+		// the compiler will put the queue on the heap.
 		BK := &buckets[op.bucketID]
+
 		// handle special cases with inf or -P / P
-		if !bucketSet[op.bucketID] {
-			bucketSet[op.bucketID] = true
+		if BK.IsInfinity() {
 			BK.Set(&op.point)
 			return
 		}
@@ -94,7 +102,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 			return
 		}
 		BK.setInfinity()
-		bucketSet[op.bucketID] = false
 		return
 	}
@@ -108,13 +115,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		// @precondition: ensures bucket is not "used" in current batch
 		BK := &buckets[bucketID]
 		// handle special cases with inf or -P / P
-		if !bucketSet[bucketID] {
+		if BK.IsInfinity() {
 			if isAdd {
 				BK.Set(PP)
 			} else {
 				BK.Neg(PP)
 			}
-			bucketSet[bucketID] = true
 			return
 		}
 		if BK.X.Equal(&PP.X) {
@@ -126,14 +132,11 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 				BK.Add(BK, BK)
 			} else {
 				BK.setInfinity()
-				bucketSet[bucketID] = false
 			}
-
 			return
 		}
 
 		if isAdd {
 			BK.setInfinity()
-			bucketSet[bucketID] = false
 		} else {
 			BK.Add(BK, BK)
 		}
@@ -157,17 +160,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		qID = 0
 	}
 
-	processQueue := func() {
+	processTopQueue := func() {
 		for i := qID - 1; i >= 0; i-- {
 			if bucketIds[queue[i].bucketID] {
 				return
 			}
 			addFromQueue(queue[i])
-			// add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg())
-			// if isFull() {
-			// 	executeAndReset()
-			// }
-			// queue[i] = queue[qID-1]
+			// len(queue) < batchSize, so there is no need to check for a full batch here.
 			qID--
 		}
 	}
@@ -189,10 +188,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 			// put it in queue
 			queue[qID].bucketID = bucketID
 			if isAdd {
-				// queue[qID].pointID = uint32(i << 1)
 				queue[qID].point.Set(&points[i])
 			} else {
-				// queue[qID].pointID = uint32(i << 1) + 1
 				queue[qID].point.Neg(&points[i])
 			}
 			qID++
@@ -208,7 +205,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		add(bucketID, &points[i], isAdd)
 		if isFull() {
 			executeAndReset()
-			processQueue() // TODO top queue only
+			processTopQueue()
 		}
 	}
@@ -217,21 +214,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 	// empty the queue
 	flushQueue()
-	// for qID != 0 {
-	// 	processQueue()
-	// 	executeAndReset()
-	// }
 	// reduce buckets into total
 	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
-
 	var runningSum, total g1JacExtended
 	runningSum.setInfinity()
 	total.setInfinity()
 	for k := len(buckets) - 1; k >= 0; k-- {
-		if bucketSet[k] {
-			runningSum.addMixed(&buckets[k])
-		}
+		runningSum.addMixed(&buckets[k])
 		if !bucketsJE[k].ZZ.IsZero() {
 			runningSum.add(&bucketsJE[k])
 		}
@@ -279,13 +269,7 @@ type qG1AffineC16 [640]batchOpG1Affine
 
 type batchOpG2Affine struct {
 	bucketID uint16
-	// pointID  uint32
-	point    G2Affine
-}
-
-func (o batchOpG2Affine) isNeg() bool {
-	return false
-	// return o.pointID&1 == 1
+	point    G2Affine
 }
 
 // processChunkG2BatchAffine process a chunk of the scalars during the msm
@@ -301,10 +285,23 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 	points []G2Affine,
 	digits []uint16) {
 
-	// init the buckets
+	// the batch affine addition needs independent points; in other words, for a window of batchSize
+	// we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying
+	// to add 2 different points to the same bucket), then we push the conflicting point to a queue.
+	// each time the batch is full, we execute it, and tentatively put the points (if they no longer conflict)
+	// from the top of the queue into the next batch.
+	// if the queue is full, we "flush" it; we sequentially add the points to the buckets in
+	// g2JacExtended coordinates.
+	// The reasoning behind this is the following: batchSize is chosen such that, for a uniformly random
+	// input, the number of conflicts is going to be low, and an element added to the queue should be immediately
+	// processed in the next batch. If that is not the case, then our inputs are not random, and we fall back to
+	// the non-batch-affine version.
+
+	// note that we have 2 sets of buckets
+	// 1 in G2Affine used with the batch affine additions
+	// 1 in g2JacExtended used in case the queue of conflicting points is full
 	var buckets B
 	var bucketsJE BJE
-	var bucketSet BS
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 		bucketsJE[i].setInfinity()
@@ -319,13 +316,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		queue   TQ  // queue of points that conflict the current batch
 		qID     int // current position in queue
 	)
-	// var queue [batchSize]batchOpG2Affine
 
 	batchSize := len(P)
 
-	isFull := func() bool {
-		return cptAdd == batchSize
-	}
+	isFull := func() bool { return cptAdd == batchSize }
 
 	executeAndReset := func() {
 		batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd)
@@ -333,12 +327,16 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		bucketIds = tmp
 		cptAdd = 0
 	}
+
 	addFromQueue := func(op batchOpG2Affine) {
-		// @precondition: ensures bucket is not "used" in current batch
+		// @precondition: the caller must ensure the bucket is not "used" in the current batch
+		// note that there is a bit of duplicate logic between add and addFromQueue;
+		// the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature),
+		// the compiler will put the queue on the heap.
 		BK := &buckets[op.bucketID]
+
 		// handle special cases with inf or -P / P
-		if !bucketSet[op.bucketID] {
-			bucketSet[op.bucketID] = true
+		if BK.IsInfinity() {
 			BK.Set(&op.point)
 			return
 		}
@@ -351,7 +349,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 			return
 		}
 		BK.setInfinity()
-		bucketSet[op.bucketID] = false
 		return
 	}
@@ -365,13 +362,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		// @precondition: ensures bucket is not "used" in current batch
 		BK := &buckets[bucketID]
 		// handle special cases with inf or -P / P
-		if !bucketSet[bucketID] {
+		if BK.IsInfinity() {
 			if isAdd {
 				BK.Set(PP)
 			} else {
 				BK.Neg(PP)
 			}
-			bucketSet[bucketID] = true
 			return
 		}
 		if BK.X.Equal(&PP.X) {
@@ -383,14 +379,11 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 				BK.Add(BK, BK)
 			} else {
 				BK.setInfinity()
-				bucketSet[bucketID] = false
 			}
-
 			return
 		}
 
 		if isAdd {
 			BK.setInfinity()
-			bucketSet[bucketID] = false
 		} else {
 			BK.Add(BK, BK)
 		}
@@ -414,17 +407,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		qID = 0
 	}
 
-	processQueue := func() {
+	processTopQueue := func() {
 		for i := qID - 1; i >= 0; i-- {
 			if bucketIds[queue[i].bucketID] {
 				return
 			}
 			addFromQueue(queue[i])
-			// add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg())
-			// if isFull() {
-			// 	executeAndReset()
-			// }
-			// queue[i] = queue[qID-1]
+			// len(queue) < batchSize, so there is no need to check for a full batch here.
qID-- } } @@ -446,10 +435,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - // queue[qID].pointID = uint32(i << 1) queue[qID].point.Set(&points[i]) } else { - // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -465,7 +452,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() // TODO top queue only + processTopQueue() } } @@ -474,21 +461,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // empty the queue flushQueue() - // for qID != 0 { - // processQueue() - // executeAndReset() - // } // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - var runningSum, total g2JacExtended runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if bucketSet[k] { - runningSum.addMixed(&buckets[k]) - } + runningSum.addMixed(&buckets[k]) if !bucketsJE[k].ZZ.IsZero() { runningSum.add(&bucketsJE[k]) } diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index 4fce6462a5..43529e1dbd 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -669,11 +669,7 @@ func fillBenchBasesG2(samplePoints []G2Affine) { func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - var mixer fr.Element - mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") - for i := 1; i <= len(sampleScalars); i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() + for i := 0; i < len(sampleScalars); i++ { + sampleScalars[i].SetRandom() } } diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index 13f133daa5..c85ea75c3f 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -22,13 +22,7 @@ import ( type batchOpG1Affine struct { bucketID uint16 - // pointID uint32 - point G1Affine -} - -func (o batchOpG1Affine) isNeg() bool { - return false - // return o.pointID&1 == 1 + point G1Affine } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -44,10 +38,23 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP points []G1Affine, digits []uint16) { - // init the buckets + // the batch affine addition needs independent points; in other words, for a window of batchSize + // we want to hit independent bucketIDs when processing the digit. If there is a conflict (we're trying + // to add 2 different points to the same bucket), then we push the conflicting point to a queue. + // Each time the batch is full, we execute it, and tentatively put the points (if not conflicting) + // from the top of the queue into the next batch. + // If the queue is full, we "flush it"; we sequentially add the points to the buckets in + // g1JacExtended coordinates. + // The reasoning behind this is the following: batchSize is chosen such that, for a uniformly random + // input, the number of conflicts is going to be low, and the element added to the queue should be immediately + // processed in the next batch. If that's not the case, then our inputs are not random, and we fall back to + // the non-batch-affine version.
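The reduce-buckets step that closes each of these chunk processors evaluates total = 1*bucket[0] + 2*bucket[1] + ... + n*bucket[n-1] with additions only: scanning from the top bucket down, each bucket is folded into a running sum, and the running sum is folded into the total, so bucket[k] ends up counted k+1 times. A minimal integer sketch of the same identity (weightedSum is a hypothetical name):

// weightedSum returns 1*b[0] + 2*b[1] + ... + n*b[n-1] using only additions,
// mirroring the runningSum/total loop over the buckets
func weightedSum(b []int) int {
    runningSum, total := 0, 0
    for k := len(b) - 1; k >= 0; k-- {
        runningSum += b[k] // from now on, b[k] is added once per remaining iteration
        total += runningSum
    }
    return total
}

For example, weightedSum([]int{7, 0, 5}) returns 7 + 2*0 + 3*5 = 22.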
+ + // note that we have 2 sets of buckets + // 1 in G1Affine used with the batch affine additions + // 1 in g1JacExtended used in case the queue of conflicting points overflows var buckets B var bucketsJE BJE - var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -62,13 +69,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) - // var queue [batchSize]batchOpG1Affine batchSize := len(P) - isFull := func() bool { - return cptAdd == batchSize - } + isFull := func() bool { return cptAdd == batchSize } executeAndReset := func() { batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) @@ -76,12 +80,16 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG1Affine) { - // @precondition: ensures bucket is not "used" in current batch + // @precondition: caller must ensure bucket is not "used" in current batch + // note that there is a bit of duplicate logic between add and addFromQueue + // the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature) + // the compiler will put the queue on the heap. BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P - if !bucketSet[op.bucketID] { - bucketSet[op.bucketID] = true + if BK.IsInfinity() { BK.Set(&op.point) return } @@ -94,7 +102,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP return } BK.setInfinity() - bucketSet[op.bucketID] = false return } @@ -108,13 +115,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if !bucketSet[bucketID] { + if BK.IsInfinity() { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } - bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -126,14 +132,11 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() - bucketSet[bucketID] = false } - return } if isAdd { BK.setInfinity() - bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -157,17 +160,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP qID = 0 } - processQueue := func() { + processTopQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { return } addFromQueue(queue[i]) - // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) - // if isFull() { - // executeAndReset() - // } - // queue[i] = queue[qID-1] + // len(queue) < batchSize so no need to check for full batch.
qID-- } } @@ -189,10 +188,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - // queue[qID].pointID = uint32(i << 1) queue[qID].point.Set(&points[i]) } else { - // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -208,7 +205,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() // TODO top queue only + processTopQueue() } } @@ -217,21 +214,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // empty the queue flushQueue() - // for qID != 0 { - // processQueue() - // executeAndReset() - // } // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - var runningSum, total g1JacExtended runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if bucketSet[k] { - runningSum.addMixed(&buckets[k]) - } + runningSum.addMixed(&buckets[k]) if !bucketsJE[k].ZZ.IsZero() { runningSum.add(&bucketsJE[k]) } @@ -279,13 +269,7 @@ type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 - // pointID uint32 - point G2Affine -} - -func (o batchOpG2Affine) isNeg() bool { - return false - // return o.pointID&1 == 1 + point G2Affine } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -301,10 +285,23 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP points []G2Affine, digits []uint16) { - // init the buckets + // the batch affine addition needs independent points; in other words, for a window of batchSize + // we want to hit independent bucketIDs when processing the digit. If there is a conflict (we're trying + // to add 2 different points to the same bucket), then we push the conflicting point to a queue. + // Each time the batch is full, we execute it, and tentatively put the points (if not conflicting) + // from the top of the queue into the next batch. + // If the queue is full, we "flush it"; we sequentially add the points to the buckets in + // g2JacExtended coordinates. + // The reasoning behind this is the following: batchSize is chosen such that, for a uniformly random + // input, the number of conflicts is going to be low, and the element added to the queue should be immediately + // processed in the next batch. If that's not the case, then our inputs are not random, and we fall back to + // the non-batch-affine version.
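Throughout these processors, the generic parameters (B, BJE, BS, TP, TPP, TQ, TC) are constrained to unions of fixed-size array types, so a declaration like var buckets B is a plain stack allocation whose size is fixed at instantiation. A minimal sketch of the pattern, with hypothetical bucketC4/bucketC5 names standing in for the generated types:

// hypothetical sizes, mirroring the generated bucket...C4 / bucket...C5 types
type bucketC4 [1 << (4 - 1)]uint64
type bucketC5 [1 << (5 - 1)]uint64

// union constraint, in the style of ibg1JacExtended / bitSet
type ibBucket interface {
    bucketC4 | bucketC5
}

// countDigits picks its bucket count at instantiation time; since B is a
// fixed-size array type, buckets lives on the stack (no make, no heap)
func countDigits[B ibBucket](digits []uint16) B {
    var buckets B
    for _, d := range digits {
        buckets[int(d)%len(buckets)]++
    }
    return buckets
}

Calling countDigits[bucketC5](digits) selects the 16-bucket variant at compile time, which is how the per-window-size bucket types (bucketg1JacExtendedC4, C5, ...) are chosen in the real code.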
+ + // note that we have 2 sets of buckets + // 1 in G2Affine used with the batch affine additions + // 1 in g2JacExtended used in case the queue of conflicting points overflows var buckets B var bucketsJE BJE - var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -319,13 +316,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) - // var queue [batchSize]batchOpG2Affine batchSize := len(P) - isFull := func() bool { - return cptAdd == batchSize - } + isFull := func() bool { return cptAdd == batchSize } executeAndReset := func() { batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) @@ -333,12 +327,16 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG2Affine) { - // @precondition: ensures bucket is not "used" in current batch + // @precondition: caller must ensure bucket is not "used" in current batch + // note that there is a bit of duplicate logic between add and addFromQueue + // the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature) + // the compiler will put the queue on the heap. BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P - if !bucketSet[op.bucketID] { - bucketSet[op.bucketID] = true + if BK.IsInfinity() { BK.Set(&op.point) return } @@ -351,7 +349,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP return } BK.setInfinity() - bucketSet[op.bucketID] = false return } @@ -365,13 +362,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if !bucketSet[bucketID] { + if BK.IsInfinity() { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } - bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -383,14 +379,11 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() - bucketSet[bucketID] = false } - return } if isAdd { BK.setInfinity() - bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -414,17 +407,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP qID = 0 } - processQueue := func() { + processTopQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { return } addFromQueue(queue[i]) - // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) - // if isFull() { - // executeAndReset() - // } - // queue[i] = queue[qID-1] + // len(queue) < batchSize so no need to check for full batch.
qID-- } } @@ -446,10 +435,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - // queue[qID].pointID = uint32(i << 1) queue[qID].point.Set(&points[i]) } else { - // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -465,7 +452,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() // TODO top queue only + processTopQueue() } } @@ -474,21 +461,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // empty the queue flushQueue() - // for qID != 0 { - // processQueue() - // executeAndReset() - // } // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - var runningSum, total g2JacExtended runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if bucketSet[k] { - runningSum.addMixed(&buckets[k]) - } + runningSum.addMixed(&buckets[k]) if !bucketsJE[k].ZZ.IsZero() { runningSum.add(&bucketsJE[k]) } diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index d21931beca..3fed44438a 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -669,11 +669,7 @@ func fillBenchBasesG2(samplePoints []G2Affine) { func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - var mixer fr.Element - mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") - for i := 1; i <= len(sampleScalars); i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() + for i := 0; i < len(sampleScalars); i++ { + sampleScalars[i].SetRandom() } } diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index 6463bc1ede..72acde2cd5 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -14,29 +14,17 @@ import ( {{- end}} ) - - - - - {{ template "multiexp" dict "CoordType" .G1.CoordType "PointName" .G1.PointName "UPointName" (toUpper .G1.PointName) "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange "LastCRange" .G1.LastCRange}} {{ template "multiexp" dict "CoordType" .G2.CoordType "PointName" .G2.PointName "UPointName" (toUpper .G2.PointName) "TAffine" $G2TAffine "TJacobian" $G2TJacobian "TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G2.CRange "LastCRange" .G2.LastCRange}} - {{define "multiexp" }} type batchOp{{ $.TAffine }} struct { bucketID uint16 - // pointID uint32 point {{ $.TAffine }} } -func (o batchOp{{ $.TAffine }}) isNeg() bool { - return false - // return o.pointID&1 == 1 -} - // processChunk{{ $.UPointName }}BatchAffine process a chunk of the scalars during the msm // using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition // we use a batch affine addition. @@ -50,10 +38,23 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B points []{{ $.TAffine }}, digits []uint16) { - // init the buckets + // the batch affine addition needs independent points; in other words, for a window of batchSize + // we want to hit independent bucketIDs when processing the digit. 
If there is a conflict (we're trying + // to add 2 different points to the same bucket), then we push the conflicting point to a queue. + // Each time the batch is full, we execute it, and tentatively put the points (if not conflicting) + // from the top of the queue into the next batch. + // If the queue is full, we "flush it"; we sequentially add the points to the buckets in + // {{ $.TJacobianExtended }} coordinates. + // The reasoning behind this is the following: batchSize is chosen such that, for a uniformly random + // input, the number of conflicts is going to be low, and the element added to the queue should be immediately + // processed in the next batch. If that's not the case, then our inputs are not random, and we fall back to + // the non-batch-affine version. + + // note that we have 2 sets of buckets + // 1 in {{ $.TAffine }} used with the batch affine additions + // 1 in {{ $.TJacobianExtended }} used in case the queue of conflicting points overflows var buckets B var bucketsJE BJE - var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -68,14 +69,10 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B queue TQ // queue of points that conflict the current batch qID int // current position in queue ) - // var queue [batchSize]batchOp{{ $.TAffine}} batchSize := len(P) - - isFull := func() bool { - return cptAdd == batchSize - } + isFull := func() bool { return cptAdd == batchSize} executeAndReset := func () { batchAdd{{ $.TAffine }}[TP, TPP, TC](&R, &P, cptAdd) @@ -83,12 +80,16 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOp{{ $.TAffine }}) { - // @precondition: ensures bucket is not "used" in current batch + // @precondition: caller must ensure bucket is not "used" in current batch + // note that there is a bit of duplicate logic between add and addFromQueue + // the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature) + // the compiler will put the queue on the heap.
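The heap remark above refers to Go's escape analysis: as long as queue elements are only copied by value, the fixed-size queue array can stay on the stack, but a pointer to one of its elements that outlives the frame forces the whole array onto the heap. A toy illustration of the mechanism (not this code path), observable with go build -gcflags=-m:

var sink *int // anything whose address is stored here escapes

func stackArray() int {
    var queue [8]int // only read by value: queue stays on the stack
    return queue[3]
}

func heapArray() int {
    var queue [8]int
    sink = &queue[3] // the address outlives the frame: queue is moved to the heap
    return queue[0]
}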
BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P - if !bucketSet[op.bucketID] { - bucketSet[op.bucketID] = true + if BK.IsInfinity() { BK.Set(&op.point) return } @@ -101,7 +102,6 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B return } BK.setInfinity() - bucketSet[op.bucketID] = false return } @@ -115,13 +115,12 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if !bucketSet[bucketID] { + if BK.IsInfinity() { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } - bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -133,14 +132,11 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B BK.Add(BK, BK) } else { BK.setInfinity() - bucketSet[bucketID] = false } - return } if isAdd { BK.setInfinity() - bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -164,17 +160,13 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B qID = 0 } - processQueue := func () { + processTopQueue := func () { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { return } - addFromQueue(queue[i]) - // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) - // if isFull() { - // executeAndReset() - // } - // queue[i] = queue[qID-1] + addFromQueue(queue[i]) + // len(queue) < batchSize so no need to check for full batch. qID-- } } @@ -197,10 +189,8 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B // put it in queue queue[qID].bucketID = bucketID if isAdd { - // queue[qID].pointID = uint32(i << 1) queue[qID].point.Set(&points[i]) } else { - // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -216,7 +206,7 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() // TODO top queue only + processTopQueue() } } @@ -226,22 +216,15 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B // empty the queue flushQueue() - // for qID != 0 { - // processQueue() - // executeAndReset() - // } // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - var runningSum, total {{ $.TJacobianExtended }} runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if bucketSet[k] { - runningSum.addMixed(&buckets[k]) - } + runningSum.addMixed(&buckets[k]) if !bucketsJE[k].ZZ.IsZero() { runningSum.add(&bucketsJE[k]) } diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index ed18ab1c46..cd2799bc6b 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -373,11 +373,7 @@ func fillBenchBases{{ toUpper $.PointName }}(samplePoints []{{ $.TAffine }}) { func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - var mixer fr.Element - mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") - for i := 1; i <= len(sampleScalars); i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). 
- FromMont() + for i := 0; i < len(sampleScalars); i++ { + sampleScalars[i].SetRandom() } } From dc404e54032d5e5b3c18191c2940759af0e16ca1 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 16 Nov 2022 11:55:01 -0600 Subject: [PATCH 32/43] fix: fix for small window size no need for stats --- ecc/bls12-377/multiexp.go | 12 +++++ ecc/bls12-377/multiexp_affine.go | 6 ++- ecc/bls12-377/multiexp_jacobian.go | 16 +++--- ecc/bls12-377/multiexp_test.go | 6 +-- ecc/bls12-378/multiexp.go | 12 +++++ ecc/bls12-378/multiexp_affine.go | 6 ++- ecc/bls12-378/multiexp_jacobian.go | 16 +++--- ecc/bls12-378/multiexp_test.go | 6 +-- ecc/bls12-381/multiexp.go | 12 +++++ ecc/bls12-381/multiexp_affine.go | 6 ++- ecc/bls12-381/multiexp_jacobian.go | 8 +-- ecc/bls12-381/multiexp_test.go | 6 +-- ecc/bls24-315/multiexp.go | 12 +++++ ecc/bls24-315/multiexp_affine.go | 6 ++- ecc/bls24-315/multiexp_jacobian.go | 16 +++--- ecc/bls24-315/multiexp_test.go | 6 +-- ecc/bls24-317/multiexp.go | 12 +++++ ecc/bls24-317/multiexp_affine.go | 6 ++- ecc/bls24-317/multiexp_jacobian.go | 8 +-- ecc/bls24-317/multiexp_test.go | 6 +-- ecc/bn254/multiexp.go | 12 +++++ ecc/bn254/multiexp_affine.go | 6 ++- ecc/bn254/multiexp_jacobian.go | 16 +++--- ecc/bn254/multiexp_test.go | 6 +-- ecc/bw6-633/multiexp.go | 30 ++++++++++- ecc/bw6-633/multiexp_affine.go | 50 ++++++++++++++---- ecc/bw6-633/multiexp_jacobian.go | 12 ++--- ecc/bw6-633/multiexp_test.go | 6 +-- ecc/bw6-756/multiexp.go | 30 ++++++++++- ecc/bw6-756/multiexp_affine.go | 50 ++++++++++++++---- ecc/bw6-756/multiexp_jacobian.go | 12 ++--- ecc/bw6-756/multiexp_test.go | 6 +-- ecc/bw6-761/multiexp.go | 34 +++++++++++- ecc/bw6-761/multiexp_affine.go | 52 +++++++++++++++---- ecc/bw6-761/multiexp_jacobian.go | 16 +++--- ecc/bw6-761/multiexp_test.go | 6 +-- internal/generator/config/curve.go | 20 +++---- internal/generator/ecc/generate.go | 17 ++++-- .../generator/ecc/template/multiexp.go.tmpl | 10 +++- .../ecc/template/multiexp_affine.go.tmpl | 4 +- .../ecc/template/multiexp_jacobian.go.tmpl | 10 +--- .../ecc/template/tests/multiexp.go.tmpl | 2 +- 42 files changed, 433 insertions(+), 158 deletions(-) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index b87523e4e0..14fdaa8bc8 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -139,6 +139,10 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { + case 1: + return processChunkG1Jacobian[bucketg1JacExtendedC1] + case 2: + return processChunkG1Jacobian[bucketg1JacExtendedC2] case 4: return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: @@ -394,6 +398,10 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { + case 1: + return processChunkG2Jacobian[bucketg2JacExtendedC1] + case 2: + return processChunkG2Jacobian[bucketg2JacExtendedC2] case 4: return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: @@ -673,6 +681,10 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) + if c <= 9 { + // no need to compute stats for small window sizes + return digits, chunkStats + } parallel.Execute(len(chunkStats), func(start, end int) { 
// for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index 1504538cf5..83ce91c32a 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -659,6 +659,8 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine +type bitSetC1 [1 << (1 - 1)]bool +type bitSetC2 [1 << (2 - 1)]bool type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool type bitSetC6 [1 << (6 - 1)]bool @@ -674,7 +676,9 @@ type bitSetC15 [1 << (15 - 1)]bool type bitSetC16 [1 << (16 - 1)]bool type bitSet interface { - bitSetC4 | + bitSetC1 | + bitSetC2 | + bitSetC4 | bitSetC5 | bitSetC6 | bitSetC7 | diff --git a/ecc/bls12-377/multiexp_jacobian.go b/ecc/bls12-377/multiexp_jacobian.go index e3c590196f..8fd4e382ff 100644 --- a/ecc/bls12-377/multiexp_jacobian.go +++ b/ecc/bls12-377/multiexp_jacobian.go @@ -61,6 +61,8 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended @@ -74,12 +76,10 @@ type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended type ibg1JacExtended interface { - bucketg1JacExtendedC2 | - bucketg1JacExtendedC1 | + bucketg1JacExtendedC1 | + bucketg1JacExtendedC2 | bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC6 | @@ -140,6 +140,8 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended @@ -153,12 +155,10 @@ type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended type ibg2JacExtended interface { - bucketg2JacExtendedC2 | - bucketg2JacExtendedC1 | + bucketg2JacExtendedC1 | + bucketg2JacExtendedC2 | bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC6 | diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 8700cfd9b3..3681f4fc71 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -99,7 +99,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + cRange := []uint64{1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { 
// test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 702015ccb8..7ca2a9edeb 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -139,6 +139,10 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { + case 2: + return processChunkG1Jacobian[bucketg1JacExtendedC2] + case 3: + return processChunkG1Jacobian[bucketg1JacExtendedC3] case 4: return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: @@ -394,6 +398,10 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { + case 2: + return processChunkG2Jacobian[bucketg2JacExtendedC2] + case 3: + return processChunkG2Jacobian[bucketg2JacExtendedC3] case 4: return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: @@ -673,6 +681,10 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) + if c <= 9 { + // no need to compute stats for small window sizes + return digits, chunkStats + } parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index 8299397508..d8b54b76ca 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -659,6 +659,8 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine +type bitSetC2 [1 << (2 - 1)]bool +type bitSetC3 [1 << (3 - 1)]bool type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool type bitSetC6 [1 << (6 - 1)]bool @@ -674,7 +676,9 @@ type bitSetC15 [1 << (15 - 1)]bool type bitSetC16 [1 << (16 - 1)]bool type bitSet interface { - bitSetC4 | + bitSetC2 | + bitSetC3 | + bitSetC4 | bitSetC5 | bitSetC6 | bitSetC7 | diff --git a/ecc/bls12-378/multiexp_jacobian.go b/ecc/bls12-378/multiexp_jacobian.go index 97a6ac8ac0..eb83e3c1c2 100644 --- a/ecc/bls12-378/multiexp_jacobian.go +++ b/ecc/bls12-378/multiexp_jacobian.go @@ -61,6 +61,8 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended @@ -74,12 +76,10 @@ type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended type bucketg1JacExtendedC15 
[1 << (15 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended type ibg1JacExtended interface { - bucketg1JacExtendedC3 | - bucketg1JacExtendedC2 | + bucketg1JacExtendedC2 | + bucketg1JacExtendedC3 | bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC6 | @@ -140,6 +140,8 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended @@ -153,12 +155,10 @@ type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended type ibg2JacExtended interface { - bucketg2JacExtendedC3 | - bucketg2JacExtendedC2 | + bucketg2JacExtendedC2 | + bucketg2JacExtendedC3 | bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC6 | diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index 44c19874e4..cb553fad54 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -99,7 +99,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + cRange := []uint64{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index 91e471a850..9de4ea488f 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -139,6 +139,10 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { + case 1: + return processChunkG1Jacobian[bucketg1JacExtendedC1] + case 3: + return processChunkG1Jacobian[bucketg1JacExtendedC3] case 4: return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: @@ -394,6 +398,10 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { + case 1: + return processChunkG2Jacobian[bucketg2JacExtendedC1] + case 3: + return processChunkG2Jacobian[bucketg2JacExtendedC3] case 4: return 
processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: @@ -673,6 +681,10 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) + if c <= 9 { + // no need to compute stats for small window sizes + return digits, chunkStats + } parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index b7af2292aa..bfc282b553 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -659,6 +659,8 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine +type bitSetC1 [1 << (1 - 1)]bool +type bitSetC3 [1 << (3 - 1)]bool type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool type bitSetC6 [1 << (6 - 1)]bool @@ -674,7 +676,9 @@ type bitSetC15 [1 << (15 - 1)]bool type bitSetC16 [1 << (16 - 1)]bool type bitSet interface { - bitSetC4 | + bitSetC1 | + bitSetC3 | + bitSetC4 | bitSetC5 | bitSetC6 | bitSetC7 | diff --git a/ecc/bls12-381/multiexp_jacobian.go b/ecc/bls12-381/multiexp_jacobian.go index 17139a4f22..bc304041f6 100644 --- a/ecc/bls12-381/multiexp_jacobian.go +++ b/ecc/bls12-381/multiexp_jacobian.go @@ -61,6 +61,8 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended @@ -74,8 +76,6 @@ type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended type ibg1JacExtended interface { bucketg1JacExtendedC1 | @@ -140,6 +140,8 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended @@ -153,8 +155,6 @@ type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended type ibg2JacExtended interface { bucketg2JacExtendedC1 | diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index 946645ded0..b58a70c951 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -99,7 +99,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + cRange := []uint64{1, 
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 46a8c8bd4f..1ca2222f9f 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -139,6 +139,10 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { + case 1: + return processChunkG1Jacobian[bucketg1JacExtendedC1] + case 2: + return processChunkG1Jacobian[bucketg1JacExtendedC2] case 4: return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: @@ -394,6 +398,10 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { + case 1: + return processChunkG2Jacobian[bucketg2JacExtendedC1] + case 2: + return processChunkG2Jacobian[bucketg2JacExtendedC2] case 4: return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: @@ -673,6 +681,10 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) + if c <= 9 { + // no need to compute stats for small window sizes + return digits, chunkStats + } parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index 6ef411f4b5..4e679fea95 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -659,6 +659,8 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine +type bitSetC1 [1 << (1 - 1)]bool +type bitSetC2 [1 << (2 - 1)]bool type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool type bitSetC6 [1 << (6 - 1)]bool @@ -674,7 +676,9 @@ type bitSetC15 [1 << (15 - 1)]bool type bitSetC16 [1 << (16 - 1)]bool type bitSet interface { - bitSetC4 | + bitSetC1 | + bitSetC2 | + bitSetC4 | bitSetC5 | bitSetC6 | bitSetC7 | diff --git a/ecc/bls24-315/multiexp_jacobian.go b/ecc/bls24-315/multiexp_jacobian.go index 9f01ed9a7a..23310862df 100644 --- a/ecc/bls24-315/multiexp_jacobian.go +++ b/ecc/bls24-315/multiexp_jacobian.go @@ -61,6 +61,8 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended @@ -74,12 +76,10 @@ type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended type 
bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended type ibg1JacExtended interface { - bucketg1JacExtendedC2 | - bucketg1JacExtendedC1 | + bucketg1JacExtendedC1 | + bucketg1JacExtendedC2 | bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC6 | @@ -140,6 +140,8 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended @@ -153,12 +155,10 @@ type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended type ibg2JacExtended interface { - bucketg2JacExtendedC2 | - bucketg2JacExtendedC1 | + bucketg2JacExtendedC1 | + bucketg2JacExtendedC2 | bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC6 | diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index 27efcb0f13..54df2a76fa 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -99,7 +99,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + cRange := []uint64{1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index d5232436d2..2120c3b479 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -139,6 +139,10 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { + case 1: + return processChunkG1Jacobian[bucketg1JacExtendedC1] + case 3: + return processChunkG1Jacobian[bucketg1JacExtendedC3] case 4: return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: @@ -394,6 +398,10 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { + case 1: + return processChunkG2Jacobian[bucketg2JacExtendedC1] + case 3: + return 
processChunkG2Jacobian[bucketg2JacExtendedC3] case 4: return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: @@ -673,6 +681,10 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) + if c <= 9 { + // no need to compute stats for small window sizes + return digits, chunkStats + } parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index a442f743ec..f657bf2bcf 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -659,6 +659,8 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine +type bitSetC1 [1 << (1 - 1)]bool +type bitSetC3 [1 << (3 - 1)]bool type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool type bitSetC6 [1 << (6 - 1)]bool @@ -674,7 +676,9 @@ type bitSetC15 [1 << (15 - 1)]bool type bitSetC16 [1 << (16 - 1)]bool type bitSet interface { - bitSetC4 | + bitSetC1 | + bitSetC3 | + bitSetC4 | bitSetC5 | bitSetC6 | bitSetC7 | diff --git a/ecc/bls24-317/multiexp_jacobian.go b/ecc/bls24-317/multiexp_jacobian.go index c4fc41bc54..973219cc4b 100644 --- a/ecc/bls24-317/multiexp_jacobian.go +++ b/ecc/bls24-317/multiexp_jacobian.go @@ -61,6 +61,8 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended @@ -74,8 +76,6 @@ type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended type ibg1JacExtended interface { bucketg1JacExtendedC1 | @@ -140,6 +140,8 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended @@ -153,8 +155,6 @@ type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended type ibg2JacExtended interface { bucketg2JacExtendedC1 | diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index 95650ab5ca..0d15fed501 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -99,7 +99,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 
7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + cRange := []uint64{1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index 6c68b58cdd..b0fb67e9af 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -139,6 +139,10 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { + case 2: + return processChunkG1Jacobian[bucketg1JacExtendedC2] + case 3: + return processChunkG1Jacobian[bucketg1JacExtendedC3] case 4: return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: @@ -394,6 +398,10 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { + case 2: + return processChunkG2Jacobian[bucketg2JacExtendedC2] + case 3: + return processChunkG2Jacobian[bucketg2JacExtendedC3] case 4: return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: @@ -673,6 +681,10 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) + if c <= 9 { + // no need to compute stats for small window sizes + return digits, chunkStats + } parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index 447d11c42c..1f132b885e 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -659,6 +659,8 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine +type bitSetC2 [1 << (2 - 1)]bool +type bitSetC3 [1 << (3 - 1)]bool type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool type bitSetC6 [1 << (6 - 1)]bool @@ -674,7 +676,9 @@ type bitSetC15 [1 << (15 - 1)]bool type bitSetC16 [1 << (16 - 1)]bool type bitSet interface { - bitSetC4 | + bitSetC2 | + bitSetC3 | + bitSetC4 | bitSetC5 | bitSetC6 | bitSetC7 | diff --git a/ecc/bn254/multiexp_jacobian.go b/ecc/bn254/multiexp_jacobian.go index 9eaccec8eb..a674d4f724 100644 --- a/ecc/bn254/multiexp_jacobian.go +++ b/ecc/bn254/multiexp_jacobian.go @@ -61,6 +61,8 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended @@ -74,12 +76,10 @@ type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended 
type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended type ibg1JacExtended interface { - bucketg1JacExtendedC3 | - bucketg1JacExtendedC2 | + bucketg1JacExtendedC2 | + bucketg1JacExtendedC3 | bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC6 | @@ -140,6 +140,8 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended @@ -153,12 +155,10 @@ type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended type ibg2JacExtended interface { - bucketg2JacExtendedC3 | - bucketg2JacExtendedC2 | + bucketg2JacExtendedC2 | + bucketg2JacExtendedC3 | bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC6 | diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index 68e7b17e41..0fcdbce7bd 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -99,7 +99,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + cRange := []uint64{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index e700b666bf..c342b9a432 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -84,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{4, 5, 8, 12, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -139,12 +139,23 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { + case 1: + return processChunkG1Jacobian[bucketg1JacExtendedC1] case 4: return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: return 
processChunkG1Jacobian[bucketg1JacExtendedC5] case 8: return processChunkG1Jacobian[bucketg1JacExtendedC8] + case 12: + const batchSize = 200 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC12] + } + return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 16: const batchSize = 640 // here we could check some chunk statistic (deviation, ...) to determine if calling @@ -279,7 +290,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{4, 5, 8, 12, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -334,12 +345,23 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { + case 1: + return processChunkG2Jacobian[bucketg2JacExtendedC1] case 4: return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: return processChunkG2Jacobian[bucketg2JacExtendedC5] case 8: return processChunkG2Jacobian[bucketg2JacExtendedC8] + case 12: + const batchSize = 200 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC12] + } + return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 16: const batchSize = 640 // here we could check some chunk statistic (deviation, ...) 
to determine if calling @@ -553,6 +575,10 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) + if c <= 9 { + // no need to compute stats for small window sizes + return digits, chunkStats + } parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index 493d750496..949a53f642 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -234,33 +234,45 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketG1AffineC12 [1 << (12 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine // buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { - bucketG1AffineC16 + bucketG1AffineC12 | + bucketG1AffineC16 } // array of coordinates fp.Element type cG1Affine interface { - cG1AffineC16 + cG1AffineC12 | + cG1AffineC16 } // buckets: array of G1Affine points (for the batch addition) type pG1Affine interface { - pG1AffineC16 + pG1AffineC12 | + pG1AffineC16 } // buckets: array of *G1Affine points (for the batch addition) type ppG1Affine interface { - ppG1AffineC16 + ppG1AffineC12 | + ppG1AffineC16 } // buckets: array of G1Affine queue operations (for the batch addition) type qOpsG1Affine interface { - qG1AffineC16 + qG1AffineC12 | + qG1AffineC16 } +// batch size 200 when c = 12 +type cG1AffineC12 [200]fp.Element +type pG1AffineC12 [200]G1Affine +type ppG1AffineC12 [200]*G1Affine +type qG1AffineC12 [200]batchOpG1Affine + // batch size 640 when c = 16 type cG1AffineC16 [640]fp.Element type pG1AffineC16 [640]G1Affine @@ -481,47 +493,63 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketG2AffineC12 [1 << (12 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine // buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { - bucketG2AffineC16 + bucketG2AffineC12 | + bucketG2AffineC16 } // array of coordinates fp.Element type cG2Affine interface { - cG2AffineC16 + cG2AffineC12 | + cG2AffineC16 } // buckets: array of G2Affine points (for the batch addition) type pG2Affine interface { - pG2AffineC16 + pG2AffineC12 | + pG2AffineC16 } // buckets: array of *G2Affine points (for the batch addition) type ppG2Affine interface { - ppG2AffineC16 + ppG2AffineC12 | + ppG2AffineC16 } // buckets: array of G2Affine queue operations (for the batch addition) type qOpsG2Affine interface { - qG2AffineC16 + qG2AffineC12 | + qG2AffineC16 } +// batch size 200 when c = 12 +type cG2AffineC12 [200]fp.Element +type pG2AffineC12 [200]G2Affine +type ppG2AffineC12 [200]*G2Affine +type qG2AffineC12 [200]batchOpG2Affine + // batch size 640 when c = 16 type cG2AffineC16 [640]fp.Element type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine +type bitSetC1 [1 << (1 - 1)]bool type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool type bitSetC8 [1 << (8 - 1)]bool +type bitSetC12 [1 << (12 - 1)]bool type bitSetC16 [1 << (16 - 1)]bool type bitSet interface { - bitSetC4 | + bitSetC1 | + bitSetC4 | bitSetC5 | bitSetC8 | + bitSetC12 | bitSetC16 } diff 
--git a/ecc/bw6-633/multiexp_jacobian.go b/ecc/bw6-633/multiexp_jacobian.go index d31a0eaf8c..497f2697fb 100644 --- a/ecc/bw6-633/multiexp_jacobian.go +++ b/ecc/bw6-633/multiexp_jacobian.go @@ -61,19 +61,19 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended +type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended type ibg1JacExtended interface { bucketg1JacExtendedC1 | - bucketg1JacExtendedC12 | bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC8 | + bucketg1JacExtendedC12 | bucketg1JacExtendedC16 } @@ -122,18 +122,18 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended type ibg2JacExtended interface { bucketg2JacExtendedC1 | - bucketg2JacExtendedC12 | bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC8 | + bucketg2JacExtendedC12 | bucketg2JacExtendedC16 } diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index b059cbc98f..60444fc36e 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -99,7 +99,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 8, 16} + cRange := []uint64{1, 4, 5, 8, 12, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index c81d3d8c0b..72b83a7eac 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -84,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{4, 5, 8, 11, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -139,12 +139,23 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul 
func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { + case 3: + return processChunkG1Jacobian[bucketg1JacExtendedC3] case 4: return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: return processChunkG1Jacobian[bucketg1JacExtendedC5] case 8: return processChunkG1Jacobian[bucketg1JacExtendedC8] + case 11: + const batchSize = 150 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC11] + } + return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 16: const batchSize = 640 // here we could check some chunk statistic (deviation, ...) to determine if calling @@ -279,7 +290,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{4, 5, 8, 11, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -334,12 +345,23 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { + case 3: + return processChunkG2Jacobian[bucketg2JacExtendedC3] case 4: return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: return processChunkG2Jacobian[bucketg2JacExtendedC5] case 8: return processChunkG2Jacobian[bucketg2JacExtendedC8] + case 11: + const batchSize = 150 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC11] + } + return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 16: const batchSize = 640 // here we could check some chunk statistic (deviation, ...) 
to determine if calling @@ -553,6 +575,10 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) + if c <= 9 { + // no need to compute stats for small window sizes + return digits, chunkStats + } parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index 003b4678bd..83cd6d1d61 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -234,33 +234,45 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketG1AffineC11 [1 << (11 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine // buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { - bucketG1AffineC16 + bucketG1AffineC11 | + bucketG1AffineC16 } // array of coordinates fp.Element type cG1Affine interface { - cG1AffineC16 + cG1AffineC11 | + cG1AffineC16 } // buckets: array of G1Affine points (for the batch addition) type pG1Affine interface { - pG1AffineC16 + pG1AffineC11 | + pG1AffineC16 } // buckets: array of *G1Affine points (for the batch addition) type ppG1Affine interface { - ppG1AffineC16 + ppG1AffineC11 | + ppG1AffineC16 } // buckets: array of G1Affine queue operations (for the batch addition) type qOpsG1Affine interface { - qG1AffineC16 + qG1AffineC11 | + qG1AffineC16 } +// batch size 150 when c = 11 +type cG1AffineC11 [150]fp.Element +type pG1AffineC11 [150]G1Affine +type ppG1AffineC11 [150]*G1Affine +type qG1AffineC11 [150]batchOpG1Affine + // batch size 640 when c = 16 type cG1AffineC16 [640]fp.Element type pG1AffineC16 [640]G1Affine @@ -481,47 +493,63 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketG2AffineC11 [1 << (11 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine // buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { - bucketG2AffineC16 + bucketG2AffineC11 | + bucketG2AffineC16 } // array of coordinates fp.Element type cG2Affine interface { - cG2AffineC16 + cG2AffineC11 | + cG2AffineC16 } // buckets: array of G2Affine points (for the batch addition) type pG2Affine interface { - pG2AffineC16 + pG2AffineC11 | + pG2AffineC16 } // buckets: array of *G2Affine points (for the batch addition) type ppG2Affine interface { - ppG2AffineC16 + ppG2AffineC11 | + ppG2AffineC16 } // buckets: array of G2Affine queue operations (for the batch addition) type qOpsG2Affine interface { - qG2AffineC16 + qG2AffineC11 | + qG2AffineC16 } +// batch size 150 when c = 11 +type cG2AffineC11 [150]fp.Element +type pG2AffineC11 [150]G2Affine +type ppG2AffineC11 [150]*G2Affine +type qG2AffineC11 [150]batchOpG2Affine + // batch size 640 when c = 16 type cG2AffineC16 [640]fp.Element type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine +type bitSetC3 [1 << (3 - 1)]bool type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool type bitSetC8 [1 << (8 - 1)]bool +type bitSetC11 [1 << (11 - 1)]bool type bitSetC16 [1 << (16 - 1)]bool type bitSet interface { - bitSetC4 | + bitSetC3 | + bitSetC4 | bitSetC5 | bitSetC8 | + bitSetC11 | bitSetC16 } diff 
--git a/ecc/bw6-756/multiexp_jacobian.go b/ecc/bw6-756/multiexp_jacobian.go index 86ccb23bbc..93fd87fe51 100644 --- a/ecc/bw6-756/multiexp_jacobian.go +++ b/ecc/bw6-756/multiexp_jacobian.go @@ -61,19 +61,19 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended +type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended type ibg1JacExtended interface { bucketg1JacExtendedC3 | - bucketg1JacExtendedC11 | bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC8 | + bucketg1JacExtendedC11 | bucketg1JacExtendedC16 } @@ -122,18 +122,18 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended type ibg2JacExtended interface { bucketg2JacExtendedC3 | - bucketg2JacExtendedC11 | bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC8 | + bucketg2JacExtendedC11 | bucketg2JacExtendedC16 } diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index 43529e1dbd..e7244b2e97 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -99,7 +99,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 8, 16} + cRange := []uint64{3, 4, 5, 8, 11, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index d928a013fd..3f987c5bd2 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -84,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{4, 5, 8, 10, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -139,12 +139,25 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul 
func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { + case 2: + return processChunkG1Jacobian[bucketg1JacExtendedC2] + case 3: + return processChunkG1Jacobian[bucketg1JacExtendedC3] case 4: return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: return processChunkG1Jacobian[bucketg1JacExtendedC5] case 8: return processChunkG1Jacobian[bucketg1JacExtendedC8] + case 10: + const batchSize = 80 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC10] + } + return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 16: const batchSize = 640 // here we could check some chunk statistic (deviation, ...) to determine if calling @@ -279,7 +292,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{4, 5, 8, 10, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -334,12 +347,25 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { + case 2: + return processChunkG2Jacobian[bucketg2JacExtendedC2] + case 3: + return processChunkG2Jacobian[bucketg2JacExtendedC3] case 4: return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: return processChunkG2Jacobian[bucketg2JacExtendedC5] case 8: return processChunkG2Jacobian[bucketg2JacExtendedC8] + case 10: + const batchSize = 80 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC10] + } + return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 16: const batchSize = 640 // here we could check some chunk statistic (deviation, ...) 
to determine if calling @@ -553,6 +579,10 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) + if c <= 9 { + // no need to compute stats for small window sizes + return digits, chunkStats + } parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index c85ea75c3f..bfeea763cb 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -234,33 +234,45 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketG1AffineC10 [1 << (10 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine // buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { - bucketG1AffineC16 + bucketG1AffineC10 | + bucketG1AffineC16 } // array of coordinates fp.Element type cG1Affine interface { - cG1AffineC16 + cG1AffineC10 | + cG1AffineC16 } // buckets: array of G1Affine points (for the batch addition) type pG1Affine interface { - pG1AffineC16 + pG1AffineC10 | + pG1AffineC16 } // buckets: array of *G1Affine points (for the batch addition) type ppG1Affine interface { - ppG1AffineC16 + ppG1AffineC10 | + ppG1AffineC16 } // buckets: array of G1Affine queue operations (for the batch addition) type qOpsG1Affine interface { - qG1AffineC16 + qG1AffineC10 | + qG1AffineC16 } +// batch size 80 when c = 10 +type cG1AffineC10 [80]fp.Element +type pG1AffineC10 [80]G1Affine +type ppG1AffineC10 [80]*G1Affine +type qG1AffineC10 [80]batchOpG1Affine + // batch size 640 when c = 16 type cG1AffineC16 [640]fp.Element type pG1AffineC16 [640]G1Affine @@ -481,47 +493,65 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketG2AffineC10 [1 << (10 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine // buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { - bucketG2AffineC16 + bucketG2AffineC10 | + bucketG2AffineC16 } // array of coordinates fp.Element type cG2Affine interface { - cG2AffineC16 + cG2AffineC10 | + cG2AffineC16 } // buckets: array of G2Affine points (for the batch addition) type pG2Affine interface { - pG2AffineC16 + pG2AffineC10 | + pG2AffineC16 } // buckets: array of *G2Affine points (for the batch addition) type ppG2Affine interface { - ppG2AffineC16 + ppG2AffineC10 | + ppG2AffineC16 } // buckets: array of G2Affine queue operations (for the batch addition) type qOpsG2Affine interface { - qG2AffineC16 + qG2AffineC10 | + qG2AffineC16 } +// batch size 80 when c = 10 +type cG2AffineC10 [80]fp.Element +type pG2AffineC10 [80]G2Affine +type ppG2AffineC10 [80]*G2Affine +type qG2AffineC10 [80]batchOpG2Affine + // batch size 640 when c = 16 type cG2AffineC16 [640]fp.Element type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine +type bitSetC2 [1 << (2 - 1)]bool +type bitSetC3 [1 << (3 - 1)]bool type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool type bitSetC8 [1 << (8 - 1)]bool +type bitSetC10 [1 << (10 - 1)]bool type bitSetC16 [1 << (16 - 1)]bool type bitSet interface { - bitSetC4 | + bitSetC2 | + bitSetC3 | + bitSetC4 | bitSetC5 | bitSetC8 
| + bitSetC10 | bitSetC16 } diff --git a/ecc/bw6-761/multiexp_jacobian.go b/ecc/bw6-761/multiexp_jacobian.go index 3039c09d6c..59edd2d1bd 100644 --- a/ecc/bw6-761/multiexp_jacobian.go +++ b/ecc/bw6-761/multiexp_jacobian.go @@ -61,21 +61,21 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended +type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended type ibg1JacExtended interface { bucketg1JacExtendedC2 | bucketg1JacExtendedC3 | - bucketg1JacExtendedC10 | bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC8 | + bucketg1JacExtendedC10 | bucketg1JacExtendedC16 } @@ -124,20 +124,20 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended type ibg2JacExtended interface { bucketg2JacExtendedC2 | bucketg2JacExtendedC3 | - bucketg2JacExtendedC10 | bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC8 | + bucketg2JacExtendedC10 | bucketg2JacExtendedC16 } diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index 3fed44438a..8354467f4b 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -99,7 +99,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 8, 16} + cRange := []uint64{2, 3, 4, 5, 8, 10, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/internal/generator/config/curve.go b/internal/generator/config/curve.go index 1ff4926ccf..e1e8bb9499 100644 --- a/internal/generator/config/curve.go +++ b/internal/generator/config/curve.go @@ -51,16 +51,16 @@ func (c Curve) Equal(other Curve) bool { } type Point struct { - CoordType string - CoordExtDegree uint8 // value n, such that q = pⁿ - CoordExtRoot int64 // value a, 
such that the field is Fp[X]/(Xⁿ - a) - PointName string - GLV bool // scalar multiplication using GLV - CofactorCleaning bool // flag telling if the Cofactor cleaning is available - CRange, LastCRange []int // multiexp bucket method: generate inner methods (with const arrays) for each c - Projective bool // generate projective coordinates - A []string //A linear coefficient in Weierstrass form - B []string //B constant term in Weierstrass form + CoordType string + CoordExtDegree uint8 // value n, such that q = pⁿ + CoordExtRoot int64 // value a, such that the field is Fp[X]/(Xⁿ - a) + PointName string + GLV bool // scalar multiplication using GLV + CofactorCleaning bool // flag telling if the Cofactor cleaning is available + CRange []int // multiexp bucket method: generate inner methods (with const arrays) for each c + Projective bool // generate projective coordinates + A []string //A linear coefficient in Weierstrass form + B []string //B constant term in Weierstrass form } var Curves []Curve diff --git a/internal/generator/ecc/generate.go b/internal/generator/ecc/generate.go index f77b9d5ca8..9af8b0dd15 100644 --- a/internal/generator/ecc/generate.go +++ b/internal/generator/ecc/generate.go @@ -4,6 +4,7 @@ import ( "fmt" "path/filepath" "reflect" + "sort" "strings" "text/template" @@ -76,18 +77,26 @@ func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) er } return false } + lastCG1 := make([]int, 0) for i := 0; i < len(conf.G1.CRange); i++ { lc := lastC(conf.G1.CRange[i]) - if !contains(conf.G1.CRange, lc) && !contains(conf.G1.LastCRange, lc) { - conf.G1.LastCRange = append(conf.G1.LastCRange, lc) + if !contains(conf.G1.CRange, lc) && !contains(lastCG1, lc) { + lastCG1 = append(lastCG1, lc) } } + conf.G1.CRange = append(conf.G1.CRange, lastCG1...) + sort.Ints(conf.G1.CRange) + + lastCG2 := make([]int, 0) for i := 0; i < len(conf.G2.CRange); i++ { lc := lastC(conf.G2.CRange[i]) - if !contains(conf.G2.CRange, lc) && !contains(conf.G2.LastCRange, lc) { - conf.G2.LastCRange = append(conf.G2.LastCRange, lc) + if !contains(conf.G2.CRange, lc) && !contains(lastCG2, lc) { + lastCG2 = append(lastCG2, lc) } } + conf.G2.CRange = append(conf.G2.CRange, lastCG2...) 
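+	// keep the merged range sorted so the generated bucket types, interface
+	// constraints and switch cases are emitted in increasing window-size order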
+ sort.Ints(conf.G2.CRange) + bavardOpts := []func(*bavard.Bavard) error{bavard.Funcs(funcs)} if err := bgen.GenerateWithOptions(conf, packageName, "./ecc/template", bavardOpts, entries...); err != nil { return err diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index d1826081ce..c8a4e6b453 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -164,6 +164,10 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) + if c <= 9 { + // no need to compute stats for small window sizes + return digits, chunkStats + } parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { @@ -404,7 +408,7 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) implementedCs := []uint64{ - {{- range $c := $.CRange}} {{- if and (eq $.PointName "g1") (gt $c 21)}}{{- else}} {{$c}},{{- end}}{{- end}} + {{- range $c := $.CRange}}{{- if ge $c 4}}{{$c}},{{- end}}{{- end}} } var C uint64 // approximate cost (in group operations) @@ -460,6 +464,10 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem func getChunkProcessor{{ $.UPointName }}(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, digits []uint16) { switch c { + {{- range $c := $.LastCRange}} + case {{$c}}: + return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}] + {{- end }} {{range $c := $.CRange}} case {{$c}}: {{- if le $c 9}} diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index 72acde2cd5..a3c609910f 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -14,8 +14,8 @@ import ( {{- end}} ) -{{ template "multiexp" dict "CoordType" .G1.CoordType "PointName" .G1.PointName "UPointName" (toUpper .G1.PointName) "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange "LastCRange" .G1.LastCRange}} -{{ template "multiexp" dict "CoordType" .G2.CoordType "PointName" .G2.PointName "UPointName" (toUpper .G2.PointName) "TAffine" $G2TAffine "TJacobian" $G2TJacobian "TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G2.CRange "LastCRange" .G2.LastCRange}} +{{ template "multiexp" dict "CoordType" .G1.CoordType "PointName" .G1.PointName "UPointName" (toUpper .G1.PointName) "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange}} +{{ template "multiexp" dict "CoordType" .G2.CoordType "PointName" .G2.PointName "UPointName" (toUpper .G2.PointName) "TAffine" $G2TAffine "TJacobian" $G2TJacobian "TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G2.CRange}} {{define "multiexp" }} diff --git a/internal/generator/ecc/template/multiexp_jacobian.go.tmpl b/internal/generator/ecc/template/multiexp_jacobian.go.tmpl index 7aaec9f186..3fd44311bc 100644 --- a/internal/generator/ecc/template/multiexp_jacobian.go.tmpl +++ 
b/internal/generator/ecc/template/multiexp_jacobian.go.tmpl @@ -8,8 +8,8 @@ -{{ template "multiexp" dict "PointName" .G1.PointName "UPointName" (toUpper .G1.PointName) "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange "LastCRange" .G1.LastCRange}} -{{ template "multiexp" dict "PointName" .G2.PointName "UPointName" (toUpper .G2.PointName) "TAffine" $G2TAffine "TJacobian" $G2TJacobian "TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G2.CRange "LastCRange" .G2.LastCRange}} +{{ template "multiexp" dict "PointName" .G1.PointName "UPointName" (toUpper .G1.PointName) "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange }} +{{ template "multiexp" dict "PointName" .G2.PointName "UPointName" (toUpper .G2.PointName) "TAffine" $G2TAffine "TJacobian" $G2TJacobian "TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G2.CRange }} @@ -66,14 +66,8 @@ func processChunk{{ $.UPointName }}Jacobian[B ib{{ $.TJacobianExtended }}](chunk {{- range $c := $.CRange}} type bucket{{ $.TJacobianExtended }}C{{$c}} [1<<({{$c}}-1)]{{ $.TJacobianExtended }} {{- end}} -{{- range $c := $.LastCRange}} -type bucket{{ $.TJacobianExtended }}C{{$c}} [1<<({{$c}}-1)]{{ $.TJacobianExtended }} -{{- end}} type ib{{ $.TJacobianExtended }} interface { - {{- range $i, $c := $.LastCRange}} - bucket{{ $.TJacobianExtended }}C{{$c}} | - {{- end}} {{- range $i, $c := $.CRange}} bucket{{ $.TJacobianExtended }}C{{$c}} {{- if not (last $i $.CRange)}} | {{- end}} {{- end}} diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index cd2799bc6b..dcba38d621 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -267,7 +267,7 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { var testPoint {{ $.TAffine }} - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { From e76541e62e6ffd6c22987545437acb868f9caaae Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 16 Nov 2022 14:33:00 -0600 Subject: [PATCH 33/43] test: restore test for all C --- ecc/bls12-377/multiexp.go | 188 +++++++++--------- ecc/bls12-377/multiexp_test.go | 179 +++++++++-------- ecc/bls12-378/multiexp.go | 188 +++++++++--------- ecc/bls12-378/multiexp_test.go | 179 +++++++++-------- ecc/bls12-381/multiexp.go | 188 +++++++++--------- ecc/bls12-381/multiexp_test.go | 179 +++++++++-------- ecc/bls24-315/multiexp.go | 188 +++++++++--------- ecc/bls24-315/multiexp_test.go | 179 +++++++++-------- ecc/bls24-317/multiexp.go | 188 +++++++++--------- ecc/bls24-317/multiexp_test.go | 179 +++++++++-------- ecc/bn254/multiexp.go | 188 +++++++++--------- ecc/bn254/multiexp_test.go | 179 +++++++++-------- ecc/bw6-633/multiexp.go | 164 +++++++-------- ecc/bw6-633/multiexp_test.go | 179 +++++++++-------- ecc/bw6-756/multiexp.go | 164 +++++++-------- ecc/bw6-756/multiexp_test.go | 179 +++++++++-------- ecc/bw6-761/multiexp.go | 172 ++++++++-------- ecc/bw6-761/multiexp_test.go | 179 +++++++++-------- go.mod | 2 +- go.sum | 4 +- .../generator/ecc/template/multiexp.go.tmpl | 67 +++---- .../ecc/template/tests/multiexp.go.tmpl | 92 ++++----- 22 files changed, 1716 insertions(+), 1688 deletions(-) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go 
index 14fdaa8bc8..7e519245cc 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -128,14 +128,58 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } + _innerMsmG1(p, C, points, scalars, config) + + return p, nil +} + +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) - _innerMsmG1(p, C, points, digits, chunkStats) + nbChunks := computeNbChunks(c) - return p, nil + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g1JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := getChunkProcessorG1(c, chunkStats[j]) + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) + } + if chunkStats[j].weight >= 115 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. + chSplit := make(chan g1JacExtended, 2) + split := n / 2 + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + go func(chunkID int) { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[chunkID] <- s1 + }(j) + continue + } + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } +// getChunkProcessorG1 decides, depending on c window size and statistics for the chunk +// to return the best algorithm to process the chunk. func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { @@ -224,50 +268,6 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { - - nbChunks := computeNbChunks(c) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack and this is critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g1JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // the last chunk may be processed with a different method than the rest, as it could be smaller. 
- n := len(points) - for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) - processChunk := getChunkProcessorG1(c, chunkStats[j]) - if j == int(nbChunks-1) { - processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) - } - if chunkStats[j].weight >= 115 { - // we split this in more go routines since this chunk has more work to do than the others. - // else what would happen is this go routine would finish much later than the others. - chSplit := make(chan g1JacExtended, 2) - split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func(chunkID int) { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[chunkID] <- s1 - }(j) - continue - } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - return msmReduceChunkG1Affine(p, int(c), chChunks[:]) -} - // msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { var _p g1JacExtended @@ -387,14 +387,58 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } + _innerMsmG2(p, C, points, scalars, config) + + return p, nil +} + +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) - _innerMsmG2(p, C, points, digits, chunkStats) + nbChunks := computeNbChunks(c) - return p, nil + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := getChunkProcessorG2(c, chunkStats[j]) + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) + } + if chunkStats[j].weight >= 115 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. + chSplit := make(chan g2JacExtended, 2) + split := n / 2 + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + go func(chunkID int) { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[chunkID] <- s1 + }(j) + continue + } + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } +// getChunkProcessorG2 decides, depending on c window size and statistics for the chunk +// to return the best algorithm to process the chunk. 
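+// Windows of 9 bits or fewer always use the generic jacobian-extended processor
+// (partitionScalars skips computing stats for c <= 9); for larger windows the
+// chunk statistics (e.g. stat.nbBucketFilled vs. the batch size) gate the
+// batch-affine path. For intuition: with a ~253-bit scalar field, c = 16 means
+// roughly ceil(253/16) = 16 chunks of 1<<15 buckets each, and the batch-affine
+// variant amortizes one field inversion over up to 640 bucket additions.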
func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { @@ -483,50 +527,6 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { - - nbChunks := computeNbChunks(c) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack and this is critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // the last chunk may be processed with a different method than the rest, as it could be smaller. - n := len(points) - for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) - processChunk := getChunkProcessorG2(c, chunkStats[j]) - if j == int(nbChunks-1) { - processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) - } - if chunkStats[j].weight >= 115 { - // we split this in more go routines since this chunk has more work to do than the others. - // else what would happen is this go routine would finish much later than the others. - chSplit := make(chan g2JacExtended, 2) - split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func(chunkID int) { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[chunkID] <- s1 - }(j) - continue - } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) -} - // msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { var _p g2JacExtended diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 3681f4fc71..4510933ea1 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -21,6 +21,7 @@ import ( "math/big" "math/bits" "math/rand" + "runtime" "sync" "testing" "time" @@ -35,9 +36,9 @@ func TestMultiExpG1(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -125,9 +126,8 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -159,12 +159,14 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -222,32 +224,32 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
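+	// (intuition: equal scalars yield equal digits, so within a given window they
+	// all target the same bucket, and a bucket with an addition already queued in
+	// the current batch cannot take another one until the batch is flushed)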
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG1(samplePoints[:]) @@ -263,19 +265,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -354,9 +356,9 @@ func TestMultiExpG2(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -442,9 +444,8 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -476,12 +477,14 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -539,32 +542,32 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
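+	// (intuition: equal scalars yield equal digits, so within a given window they
+	// all target the same bucket, and a bucket with an addition already queued in
+	// the current batch cannot take another one until the batch is flushed)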
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG2(samplePoints[:]) @@ -580,19 +583,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 7ca2a9edeb..63476d87c9 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -128,14 +128,58 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } + _innerMsmG1(p, C, points, scalars, config) + + return p, nil +} + +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) - _innerMsmG1(p, C, points, digits, chunkStats) + nbChunks := computeNbChunks(c) - return p, nil + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g1JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := getChunkProcessorG1(c, chunkStats[j]) + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) + } + if chunkStats[j].weight >= 115 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. 
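+	// split this chunk's points in two halves processed concurrently, then
+	// add the two partial sums before sending the chunk result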
+ chSplit := make(chan g1JacExtended, 2) + split := n / 2 + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + go func(chunkID int) { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[chunkID] <- s1 + }(j) + continue + } + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } +// getChunkProcessorG1 decides, depending on c window size and statistics for the chunk +// to return the best algorithm to process the chunk. func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { @@ -224,50 +268,6 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { - - nbChunks := computeNbChunks(c) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack and this is critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g1JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // the last chunk may be processed with a different method than the rest, as it could be smaller. - n := len(points) - for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) - processChunk := getChunkProcessorG1(c, chunkStats[j]) - if j == int(nbChunks-1) { - processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) - } - if chunkStats[j].weight >= 115 { - // we split this in more go routines since this chunk has more work to do than the others. - // else what would happen is this go routine would finish much later than the others. 
- chSplit := make(chan g1JacExtended, 2) - split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func(chunkID int) { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[chunkID] <- s1 - }(j) - continue - } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - return msmReduceChunkG1Affine(p, int(c), chChunks[:]) -} - // msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { var _p g1JacExtended @@ -387,14 +387,58 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } + _innerMsmG2(p, C, points, scalars, config) + + return p, nil +} + +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) - _innerMsmG2(p, C, points, digits, chunkStats) + nbChunks := computeNbChunks(c) - return p, nil + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := getChunkProcessorG2(c, chunkStats[j]) + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) + } + if chunkStats[j].weight >= 115 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. + chSplit := make(chan g2JacExtended, 2) + split := n / 2 + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + go func(chunkID int) { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[chunkID] <- s1 + }(j) + continue + } + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } +// getChunkProcessorG2 decides, depending on c window size and statistics for the chunk +// to return the best algorithm to process the chunk. 
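+// The switch below is generated per curve: it maps each supported window size
+// c to a specialized processor, and stat is meant to let it choose between
+// bucket methods (e.g. extended-Jacobian vs. batch-affine) for that chunk.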
func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { @@ -483,50 +527,6 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { - - nbChunks := computeNbChunks(c) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack and this is critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // the last chunk may be processed with a different method than the rest, as it could be smaller. - n := len(points) - for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) - processChunk := getChunkProcessorG2(c, chunkStats[j]) - if j == int(nbChunks-1) { - processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) - } - if chunkStats[j].weight >= 115 { - // we split this in more go routines since this chunk has more work to do than the others. - // else what would happen is this go routine would finish much later than the others. - chSplit := make(chan g2JacExtended, 2) - split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func(chunkID int) { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[chunkID] <- s1 - }(j) - continue - } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) -} - // msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { var _p g2JacExtended diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index cb553fad54..5bef450dc0 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -21,6 +21,7 @@ import ( "math/big" "math/bits" "math/rand" + "runtime" "sync" "testing" "time" @@ -35,9 +36,9 @@ func TestMultiExpG1(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -125,9 +126,8 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -159,12 +159,14 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
 FromMont()
+			if i%10 == 0 {
+				samplePointsZero[i].setInfinity()
+			}
 		}
 
 		results := make([]G1Jac, len(cRange))
-		for i := range cRange {
-			// TODO @gbotrel restore test for all C
-			results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{})
+		for i, c := range cRange {
+			_innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()})
 		}
 		for i := 1; i < len(results); i++ {
 			if !results[i].Equal(&results[i-1]) {
@@ -222,32 +224,32 @@ func BenchmarkMultiExpG1(b *testing.B) {
 	)
 
 	var (
-		samplePoints  [nbSamples]G1Affine
-		sampleScalars [nbSamples]fr.Element
-		// sampleScalarsSmallValues [nbSamples]fr.Element
-		// sampleScalarsRedundant [nbSamples]fr.Element
+		samplePoints             [nbSamples]G1Affine
+		sampleScalars            [nbSamples]fr.Element
+		sampleScalarsSmallValues [nbSamples]fr.Element
+		sampleScalarsRedundant   [nbSamples]fr.Element
 	)
 
 	fillBenchScalars(sampleScalars[:])
-	// copy(sampleScalarsSmallValues[:],sampleScalars[:])
-	// copy(sampleScalarsRedundant[:],sampleScalars[:])
-
-	// // this means first chunk is going to have more work to do and should be split into several go routines
-	// for i:=0; i < len(sampleScalarsSmallValues);i++ {
-	// 	if i % 5 == 0 {
-	// 		sampleScalarsSmallValues[i].SetZero()
-	// 		sampleScalarsSmallValues[i][0] = 1
-	// 	}
-	// }
-
-	// // bad case for batch affine because scalar distribution might look uniform
-	// // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine
-	// // to process small batches of additions to flush its queue of conflicted points.
-	// for i:=0; i < len(sampleScalarsRedundant);i+=100 {
-	// 	for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ {
-	// 		sampleScalarsRedundant[j] = sampleScalarsRedundant[i]
-	// 	}
-	// }
+	copy(sampleScalarsSmallValues[:], sampleScalars[:])
+	copy(sampleScalarsRedundant[:], sampleScalars[:])
+
+	// this means the first chunk will have more work to do than the others and should be split into several goroutines
+	for i := 0; i < len(sampleScalarsSmallValues); i++ {
+		if i%5 == 0 {
+			sampleScalarsSmallValues[i].SetZero()
+			sampleScalarsSmallValues[i][0] = 1
+		}
+	}
+
+	// bad case for batch affine because the scalar distribution might look uniform
+	// but over batchSize windows, we may hit a lot of conflicts and force the msm-affine
+	// to process small batches of additions to flush its queue of conflicted points.
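+	// concretely, the loop below keeps a single distinct value per window of
+	// 100 scalars (roughly nbSamples/100 distinct values overall); identical
+	// scalars decompose into identical c-bit digits, so in every chunk those
+	// 100 points all land in the same bucket.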
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG1(samplePoints[:]) @@ -263,19 +265,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -354,9 +356,9 @@ func TestMultiExpG2(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -442,9 +444,8 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -476,12 +477,14 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -539,32 +542,32 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
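+	// same adversarial input as the G1 benchmark above: repeated bucket hits
+	// force the batch-affine processor to flush short batches, and each flush
+	// is costlier here since G2 coordinates live in a quadratic extension.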
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG2(samplePoints[:]) @@ -580,19 +583,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index 9de4ea488f..4a389f9e4f 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -128,14 +128,58 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } + _innerMsmG1(p, C, points, scalars, config) + + return p, nil +} + +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) - _innerMsmG1(p, C, points, digits, chunkStats) + nbChunks := computeNbChunks(c) - return p, nil + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g1JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := getChunkProcessorG1(c, chunkStats[j]) + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) + } + if chunkStats[j].weight >= 115 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. 
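+			// the two halves are merged by the small fan-in goroutine below:
+			// chSplit is buffered with capacity 2 so neither half blocks on
+			// send, and the combined sum is forwarded to chChunks[j].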
+ chSplit := make(chan g1JacExtended, 2) + split := n / 2 + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + go func(chunkID int) { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[chunkID] <- s1 + }(j) + continue + } + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } +// getChunkProcessorG1 decides, depending on c window size and statistics for the chunk +// to return the best algorithm to process the chunk. func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { @@ -224,50 +268,6 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { - - nbChunks := computeNbChunks(c) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack and this is critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g1JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // the last chunk may be processed with a different method than the rest, as it could be smaller. - n := len(points) - for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) - processChunk := getChunkProcessorG1(c, chunkStats[j]) - if j == int(nbChunks-1) { - processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) - } - if chunkStats[j].weight >= 115 { - // we split this in more go routines since this chunk has more work to do than the others. - // else what would happen is this go routine would finish much later than the others. 
- chSplit := make(chan g1JacExtended, 2) - split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func(chunkID int) { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[chunkID] <- s1 - }(j) - continue - } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - return msmReduceChunkG1Affine(p, int(c), chChunks[:]) -} - // msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { var _p g1JacExtended @@ -387,14 +387,58 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } + _innerMsmG2(p, C, points, scalars, config) + + return p, nil +} + +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) - _innerMsmG2(p, C, points, digits, chunkStats) + nbChunks := computeNbChunks(c) - return p, nil + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := getChunkProcessorG2(c, chunkStats[j]) + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) + } + if chunkStats[j].weight >= 115 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. + chSplit := make(chan g2JacExtended, 2) + split := n / 2 + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + go func(chunkID int) { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[chunkID] <- s1 + }(j) + continue + } + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } +// getChunkProcessorG2 decides, depending on c window size and statistics for the chunk +// to return the best algorithm to process the chunk. 
func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { @@ -483,50 +527,6 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { - - nbChunks := computeNbChunks(c) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack and this is critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // the last chunk may be processed with a different method than the rest, as it could be smaller. - n := len(points) - for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) - processChunk := getChunkProcessorG2(c, chunkStats[j]) - if j == int(nbChunks-1) { - processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) - } - if chunkStats[j].weight >= 115 { - // we split this in more go routines since this chunk has more work to do than the others. - // else what would happen is this go routine would finish much later than the others. - chSplit := make(chan g2JacExtended, 2) - split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func(chunkID int) { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[chunkID] <- s1 - }(j) - continue - } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) -} - // msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { var _p g2JacExtended diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index b58a70c951..fdd7809b56 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -21,6 +21,7 @@ import ( "math/big" "math/bits" "math/rand" + "runtime" "sync" "testing" "time" @@ -35,9 +36,9 @@ func TestMultiExpG1(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -125,9 +126,8 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -159,12 +159,14 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -222,32 +224,32 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
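+	// the small-values loop above forces every 5th scalar to a tiny value
+	// (only the lowest limb set), so those scalars contribute digits only in
+	// the lowest window; that skews work towards chunk 0, the case the
+	// weight >= 115 split in _innerMsmG1 is meant to absorb.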
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG1(samplePoints[:]) @@ -263,19 +265,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -354,9 +356,9 @@ func TestMultiExpG2(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -442,9 +444,8 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -476,12 +477,14 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -539,32 +542,32 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG2(samplePoints[:]) @@ -580,19 +583,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 1ca2222f9f..9b3b5eb0d8 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -128,14 +128,58 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } + _innerMsmG1(p, C, points, scalars, config) + + return p, nil +} + +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) - _innerMsmG1(p, C, points, digits, chunkStats) + nbChunks := computeNbChunks(c) - return p, nil + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g1JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := getChunkProcessorG1(c, chunkStats[j]) + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) + } + if chunkStats[j].weight >= 115 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. 
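+			// chunkStats[j].weight is a per-chunk work estimate computed by
+			// partitionScalars; 115 is an empirical cutoff in this patch above
+			// which a single goroutine would lag behind the other chunks.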
+ chSplit := make(chan g1JacExtended, 2) + split := n / 2 + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + go func(chunkID int) { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[chunkID] <- s1 + }(j) + continue + } + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } +// getChunkProcessorG1 decides, depending on c window size and statistics for the chunk +// to return the best algorithm to process the chunk. func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { @@ -224,50 +268,6 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { - - nbChunks := computeNbChunks(c) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack and this is critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g1JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // the last chunk may be processed with a different method than the rest, as it could be smaller. - n := len(points) - for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) - processChunk := getChunkProcessorG1(c, chunkStats[j]) - if j == int(nbChunks-1) { - processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) - } - if chunkStats[j].weight >= 115 { - // we split this in more go routines since this chunk has more work to do than the others. - // else what would happen is this go routine would finish much later than the others. 
- chSplit := make(chan g1JacExtended, 2) - split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func(chunkID int) { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[chunkID] <- s1 - }(j) - continue - } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - return msmReduceChunkG1Affine(p, int(c), chChunks[:]) -} - // msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { var _p g1JacExtended @@ -387,14 +387,58 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } + _innerMsmG2(p, C, points, scalars, config) + + return p, nil +} + +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) - _innerMsmG2(p, C, points, digits, chunkStats) + nbChunks := computeNbChunks(c) - return p, nil + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := getChunkProcessorG2(c, chunkStats[j]) + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) + } + if chunkStats[j].weight >= 115 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. + chSplit := make(chan g2JacExtended, 2) + split := n / 2 + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + go func(chunkID int) { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[chunkID] <- s1 + }(j) + continue + } + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } +// getChunkProcessorG2 decides, depending on c window size and statistics for the chunk +// to return the best algorithm to process the chunk. 
func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { @@ -483,50 +527,6 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { - - nbChunks := computeNbChunks(c) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack and this is critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // the last chunk may be processed with a different method than the rest, as it could be smaller. - n := len(points) - for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) - processChunk := getChunkProcessorG2(c, chunkStats[j]) - if j == int(nbChunks-1) { - processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) - } - if chunkStats[j].weight >= 115 { - // we split this in more go routines since this chunk has more work to do than the others. - // else what would happen is this go routine would finish much later than the others. - chSplit := make(chan g2JacExtended, 2) - split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func(chunkID int) { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[chunkID] <- s1 - }(j) - continue - } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) -} - // msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { var _p g2JacExtended diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index 54df2a76fa..bdb2e0167a 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -21,6 +21,7 @@ import ( "math/big" "math/bits" "math/rand" + "runtime" "sync" "testing" "time" @@ -35,9 +36,9 @@ func TestMultiExpG1(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -125,9 +126,8 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -159,12 +159,14 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -222,32 +224,32 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
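+	// both skewed variants start from the same random scalars (copied above)
+	// and run against the same base points, so timing differences come from
+	// the digit distribution alone.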
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG1(samplePoints[:]) @@ -263,19 +265,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -354,9 +356,9 @@ func TestMultiExpG2(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -442,9 +444,8 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -476,12 +477,14 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -539,32 +542,32 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG2(samplePoints[:]) @@ -580,19 +583,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index 2120c3b479..6b98c52875 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -128,14 +128,58 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } + _innerMsmG1(p, C, points, scalars, config) + + return p, nil +} + +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) - _innerMsmG1(p, C, points, digits, chunkStats) + nbChunks := computeNbChunks(c) - return p, nil + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g1JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := getChunkProcessorG1(c, chunkStats[j]) + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) + } + if chunkStats[j].weight >= 115 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. 
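+			// chunks are dispatched from the most-significant window
+			// (j = nbChunks-1) down to 0; the top chunk may be narrower than
+			// c bits, hence the lastC(c) processor selected above.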
+ chSplit := make(chan g1JacExtended, 2) + split := n / 2 + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + go func(chunkID int) { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[chunkID] <- s1 + }(j) + continue + } + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } +// getChunkProcessorG1 decides, depending on c window size and statistics for the chunk +// to return the best algorithm to process the chunk. func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { @@ -224,50 +268,6 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { - - nbChunks := computeNbChunks(c) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack and this is critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g1JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // the last chunk may be processed with a different method than the rest, as it could be smaller. - n := len(points) - for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) - processChunk := getChunkProcessorG1(c, chunkStats[j]) - if j == int(nbChunks-1) { - processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) - } - if chunkStats[j].weight >= 115 { - // we split this in more go routines since this chunk has more work to do than the others. - // else what would happen is this go routine would finish much later than the others. 
- chSplit := make(chan g1JacExtended, 2) - split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func(chunkID int) { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[chunkID] <- s1 - }(j) - continue - } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - return msmReduceChunkG1Affine(p, int(c), chChunks[:]) -} - // msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { var _p g1JacExtended @@ -387,14 +387,58 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } + _innerMsmG2(p, C, points, scalars, config) + + return p, nil +} + +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) - _innerMsmG2(p, C, points, digits, chunkStats) + nbChunks := computeNbChunks(c) - return p, nil + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := getChunkProcessorG2(c, chunkStats[j]) + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) + } + if chunkStats[j].weight >= 115 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. + chSplit := make(chan g2JacExtended, 2) + split := n / 2 + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + go func(chunkID int) { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[chunkID] <- s1 + }(j) + continue + } + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } +// getChunkProcessorG2 decides, depending on c window size and statistics for the chunk +// to return the best algorithm to process the chunk. 
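+// The returned closure has the same signature whatever strategy is picked,
+// so the caller in _innerMsmG2 can hand it to a goroutine without knowing
+// which bucket method was chosen.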
func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { @@ -483,50 +527,6 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { - - nbChunks := computeNbChunks(c) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack and this is critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // the last chunk may be processed with a different method than the rest, as it could be smaller. - n := len(points) - for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) - processChunk := getChunkProcessorG2(c, chunkStats[j]) - if j == int(nbChunks-1) { - processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) - } - if chunkStats[j].weight >= 115 { - // we split this in more go routines since this chunk has more work to do than the others. - // else what would happen is this go routine would finish much later than the others. - chSplit := make(chan g2JacExtended, 2) - split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func(chunkID int) { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[chunkID] <- s1 - }(j) - continue - } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) -} - // msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { var _p g2JacExtended diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index 0d15fed501..48420037d0 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -21,6 +21,7 @@ import ( "math/big" "math/bits" "math/rand" + "runtime" "sync" "testing" "time" @@ -35,9 +36,9 @@ func TestMultiExpG1(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -125,9 +126,8 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -159,12 +159,14 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -222,32 +224,32 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
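[editor's sketch] The conflict mechanism the comment above describes can be made concrete: the batch-affine processor may queue each bucket at most once per batch, so a run of identical scalars keeps hitting the same bucket and forces the batch to be flushed early. A toy model, assuming a map in place of the generated fixed-size bitSetCxx types:

type bitSet map[uint16]bool // stand-in for the fixed-size bitSetCxx arrays

func countFlushes(digits []uint16, batchSize int) int {
    inBatch := bitSet{}
    flushes, n := 0, 0
    for _, d := range digits {
        if inBatch[d] || n == batchSize { // bucket conflict, or batch full
            flushes++
            inBatch, n = bitSet{}, 0
        }
        inBatch[d] = true
        n++
    }
    return flushes
}

Uniformly random digits flush roughly every batchSize additions; the redundant distribution built below flushes almost every addition.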
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG1(samplePoints[:]) @@ -263,19 +265,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -354,9 +356,9 @@ func TestMultiExpG2(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -442,9 +444,8 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -476,12 +477,14 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -539,32 +542,32 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
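[editor's sketch] A rough way to size the conflict pressure the comment above worries about: a c-bit signed-digit window drives on the order of 2^(c-1) buckets, so the chance a fresh digit collides with one already queued grows with batchSize/2^(c-1). A back-of-envelope helper, illustrative only:

func conflictPressure(c uint64, batchSize int) float64 {
    nbBuckets := float64(uint64(1) << (c - 1)) // signed-digit windows use ~2^(c-1) buckets
    return float64(batchSize) / nbBuckets      // rough per-addition collision rate
}

With the constants used in this patch that ratio is about 200/2048 ≈ 10% for c = 12 and 640/32768 ≈ 2% for c = 16, under a uniform-digit assumption.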
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG2(samplePoints[:]) @@ -580,19 +583,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index b0fb67e9af..791fc0c19e 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -128,14 +128,58 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } + _innerMsmG1(p, C, points, scalars, config) + + return p, nil +} + +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) - _innerMsmG1(p, C, points, digits, chunkStats) + nbChunks := computeNbChunks(c) - return p, nil + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g1JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := getChunkProcessorG1(c, chunkStats[j]) + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) + } + if chunkStats[j].weight >= 115 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. 
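[editor's sketch] The weight >= 115 test above fires exactly for distributions like the small-values benchmark: when most scalars fit in the lowest window, chunk 0 carries nearly all the work. A hypothetical illustration of that skew, ignoring the signed-digit recoding that partitionScalars actually performs:

func chunkWeights(scalars []uint64, c uint64, nbChunks int) []int {
    w := make([]int, nbChunks)
    mask := (uint64(1) << c) - 1
    for _, s := range scalars {
        for j := 0; j < nbChunks; j++ {
            if (s>>(uint64(j)*c))&mask != 0 {
                w[j]++ // a non-zero digit means work for chunk j
            }
        }
    }
    return w // the real chunkStat.weight is a normalized statistic; this just counts work
}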
+ chSplit := make(chan g1JacExtended, 2) + split := n / 2 + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + go func(chunkID int) { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[chunkID] <- s1 + }(j) + continue + } + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } +// getChunkProcessorG1 decides, depending on c window size and statistics for the chunk +// to return the best algorithm to process the chunk. func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { @@ -224,50 +268,6 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { - - nbChunks := computeNbChunks(c) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack and this is critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g1JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // the last chunk may be processed with a different method than the rest, as it could be smaller. - n := len(points) - for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) - processChunk := getChunkProcessorG1(c, chunkStats[j]) - if j == int(nbChunks-1) { - processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) - } - if chunkStats[j].weight >= 115 { - // we split this in more go routines since this chunk has more work to do than the others. - // else what would happen is this go routine would finish much later than the others. 
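[editor's sketch] Both the new code above and the deleted version below slice digits the same way: partitionScalars produces a chunk-major layout where, for n points, chunk j owns digits[j*n:(j+1)*n], and the heavy-chunk split hands half of that range to each worker. A sketch making the slicing explicit; chunkDigits is an illustrative helper:

func chunkDigits(digits []uint16, j, n int) (firstHalf, secondHalf []uint16) {
    chunk := digits[j*n : (j+1)*n] // chunk-major layout: chunk j owns n digits
    split := n / 2
    return chunk[:split], chunk[split:]
}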
- chSplit := make(chan g1JacExtended, 2) - split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func(chunkID int) { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[chunkID] <- s1 - }(j) - continue - } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - return msmReduceChunkG1Affine(p, int(c), chChunks[:]) -} - // msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { var _p g1JacExtended @@ -387,14 +387,58 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } + _innerMsmG2(p, C, points, scalars, config) + + return p, nil +} + +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) - _innerMsmG2(p, C, points, digits, chunkStats) + nbChunks := computeNbChunks(c) - return p, nil + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := getChunkProcessorG2(c, chunkStats[j]) + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) + } + if chunkStats[j].weight >= 115 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. + chSplit := make(chan g2JacExtended, 2) + split := n / 2 + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + go func(chunkID int) { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[chunkID] <- s1 + }(j) + continue + } + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } +// getChunkProcessorG2 decides, depending on c window size and statistics for the chunk +// to return the best algorithm to process the chunk. 
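[editor's sketch] msmReduceChunkG2Affine, returned to just above, performs the standard bucket-method fold: start from the most-significant chunk, then repeatedly double c times and add the next chunk down. Modeled with integers standing in for curve points:

func reduceChunks(partials []int, c int) int {
    if len(partials) == 0 {
        return 0
    }
    total := partials[len(partials)-1] // most-significant chunk first
    for j := len(partials) - 2; j >= 0; j-- {
        for i := 0; i < c; i++ {
            total *= 2 // c doublings shift the accumulator down one window
        }
        total += partials[j]
    }
    return total
}

With partials p0..p2 this evaluates p2*2^(2c) + p1*2^c + p0, which is why each chunk can be computed independently before the fold.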
func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { @@ -483,50 +527,6 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { - - nbChunks := computeNbChunks(c) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack and this is critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // the last chunk may be processed with a different method than the rest, as it could be smaller. - n := len(points) - for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) - processChunk := getChunkProcessorG2(c, chunkStats[j]) - if j == int(nbChunks-1) { - processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) - } - if chunkStats[j].weight >= 115 { - // we split this in more go routines since this chunk has more work to do than the others. - // else what would happen is this go routine would finish much later than the others. - chSplit := make(chan g2JacExtended, 2) - split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func(chunkID int) { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[chunkID] <- s1 - }(j) - continue - } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) -} - // msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { var _p g2JacExtended diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index 0fcdbce7bd..00cd01348e 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -21,6 +21,7 @@ import ( "math/big" "math/bits" "math/rand" + "runtime" "sync" "testing" "time" @@ -35,9 +36,9 @@ func TestMultiExpG1(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -125,9 +126,8 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -159,12 +159,14 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -222,32 +224,32 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
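[editor's sketch] The samplePointsZero loop at the top of this hunk checks that inputs at infinity contribute nothing, for every window size. A hypothetical reference model of that property, with integers in place of group elements:

func referenceMSM(scalars []uint64, points []int64, isInf []bool) int64 {
    var acc int64
    for i := range points {
        if isInf[i] {
            continue // the group identity contributes nothing
        }
        acc += int64(scalars[i]) * points[i] // placeholder for scalar-mul-then-add
    }
    return acc
}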
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG1(samplePoints[:]) @@ -263,19 +265,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -354,9 +356,9 @@ func TestMultiExpG2(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -442,9 +444,8 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -476,12 +477,14 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -539,32 +542,32 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
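[editor's sketch] The test pattern used throughout this patch — running the inner MSM once per window size in cRange and requiring all results to agree — condenses to a helper like the following; crossCheck is illustrative, not part of the test suite, and assumes a non-empty cRange:

func crossCheck(cRange []uint64, msm func(c uint64) int) bool {
    prev := msm(cRange[0])
    for _, c := range cRange[1:] {
        r := msm(c)
        if r != prev {
            return false // two window sizes disagree: a bug in one processor
        }
        prev = r
    }
    return true
}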
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG2(samplePoints[:]) @@ -580,19 +583,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index c342b9a432..eeb7d4e43a 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -128,50 +128,14 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } - // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - _innerMsmG1(p, C, points, digits, chunkStats) + _innerMsmG1(p, C, points, scalars, config) return p, nil } -func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { - switch c { - - case 1: - return processChunkG1Jacobian[bucketg1JacExtendedC1] - case 4: - return processChunkG1Jacobian[bucketg1JacExtendedC4] - case 5: - return processChunkG1Jacobian[bucketg1JacExtendedC5] - case 8: - return processChunkG1Jacobian[bucketg1JacExtendedC8] - case 12: - const batchSize = 200 - // here we could check some chunk statistic (deviation, ...) to determine if calling - // the batch affine version is worth it. - if stat.nbBucketFilled < batchSize { - // clear indicator that batch affine method is not appropriate here. - return processChunkG1Jacobian[bucketg1JacExtendedC12] - } - return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] - case 16: - const batchSize = 640 - // here we could check some chunk statistic (deviation, ...) to determine if calling - // the batch affine version is worth it. - if stat.nbBucketFilled < batchSize { - // clear indicator that batch affine method is not appropriate here. 
- return processChunkG1Jacobian[bucketg1JacExtendedC16] - } - return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - default: - // panic("will not happen c != previous values is not generated by templates") - return processChunkG1Jacobian[bucketg1JacExtendedC16] - } -} - -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { + // partition the scalars + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) nbChunks := computeNbChunks(c) @@ -188,7 +152,6 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -215,6 +178,43 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } +// getChunkProcessorG1 decides, depending on c window size and statistics for the chunk +// to return the best algorithm to process the chunk. +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { + switch c { + + case 1: + return processChunkG1Jacobian[bucketg1JacExtendedC1] + case 4: + return processChunkG1Jacobian[bucketg1JacExtendedC4] + case 5: + return processChunkG1Jacobian[bucketg1JacExtendedC5] + case 8: + return processChunkG1Jacobian[bucketg1JacExtendedC8] + case 12: + const batchSize = 200 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC12] + } + return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] + case 16: + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. 
+ return processChunkG1Jacobian[bucketg1JacExtendedC16] + } + return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + default: + // panic("will not happen c != previous values is not generated by templates") + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } +} + // msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { var _p g1JacExtended @@ -334,50 +334,14 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } - // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - _innerMsmG2(p, C, points, digits, chunkStats) + _innerMsmG2(p, C, points, scalars, config) return p, nil } -func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { - switch c { - - case 1: - return processChunkG2Jacobian[bucketg2JacExtendedC1] - case 4: - return processChunkG2Jacobian[bucketg2JacExtendedC4] - case 5: - return processChunkG2Jacobian[bucketg2JacExtendedC5] - case 8: - return processChunkG2Jacobian[bucketg2JacExtendedC8] - case 12: - const batchSize = 200 - // here we could check some chunk statistic (deviation, ...) to determine if calling - // the batch affine version is worth it. - if stat.nbBucketFilled < batchSize { - // clear indicator that batch affine method is not appropriate here. - return processChunkG2Jacobian[bucketg2JacExtendedC12] - } - return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] - case 16: - const batchSize = 640 - // here we could check some chunk statistic (deviation, ...) to determine if calling - // the batch affine version is worth it. - if stat.nbBucketFilled < batchSize { - // clear indicator that batch affine method is not appropriate here. - return processChunkG2Jacobian[bucketg2JacExtendedC16] - } - return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - default: - // panic("will not happen c != previous values is not generated by templates") - return processChunkG2Jacobian[bucketg2JacExtendedC16] - } -} - -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { + // partition the scalars + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) nbChunks := computeNbChunks(c) @@ -394,7 +358,6 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. 
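[editor's sketch] The context line above is the reason lastC(c) exists: the scalar bit-width is generally not a multiple of c, so the top window is narrower and may deserve a different processor. A hypothetical reading of that computation (the library's exact definition may differ, e.g. by an extra carry bit from the signed recoding):

func lastWindow(scalarBits, c uint64) uint64 {
    if r := scalarBits % c; r != 0 {
        return r // the top chunk only covers the leftover high bits
    }
    return c
}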
n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) @@ -421,6 +384,43 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } +// getChunkProcessorG2 decides, depending on c window size and statistics for the chunk +// to return the best algorithm to process the chunk. +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { + switch c { + + case 1: + return processChunkG2Jacobian[bucketg2JacExtendedC1] + case 4: + return processChunkG2Jacobian[bucketg2JacExtendedC4] + case 5: + return processChunkG2Jacobian[bucketg2JacExtendedC5] + case 8: + return processChunkG2Jacobian[bucketg2JacExtendedC8] + case 12: + const batchSize = 200 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC12] + } + return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] + case 16: + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC16] + } + return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + default: + // panic("will not happen c != previous values is not generated by templates") + return processChunkG2Jacobian[bucketg2JacExtendedC16] + } +} + // msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { var _p g2JacExtended diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 60444fc36e..6572865155 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -21,6 +21,7 @@ import ( "math/big" "math/bits" "math/rand" + "runtime" "sync" "testing" "time" @@ -35,9 +36,9 @@ func TestMultiExpG1(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -125,9 +126,8 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -159,12 +159,14 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). 
Mul(&sampleScalars[i-1], &mixer). FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -222,32 +224,32 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
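[editor's sketch] The _innerMsm calls in this hunk pass ecc.MultiExpConfig{NbTasks: runtime.NumCPU()} explicitly because they bypass MultiExp, which is assumed to fill that default in itself. A sketch of that defaulting convention; effectiveNbTasks is an illustrative name:

package sketch

import "runtime"

func effectiveNbTasks(nbTasks int) int {
    if nbTasks <= 0 {
        return runtime.NumCPU() // unset config: use all available CPUs
    }
    return nbTasks
}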
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG1(samplePoints[:]) @@ -263,19 +265,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -354,9 +356,9 @@ func TestMultiExpG2(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -442,9 +444,8 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -476,12 +477,14 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -539,32 +542,32 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG2(samplePoints[:]) @@ -580,19 +583,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index 72b83a7eac..35135f2959 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -128,50 +128,14 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } - // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - _innerMsmG1(p, C, points, digits, chunkStats) + _innerMsmG1(p, C, points, scalars, config) return p, nil } -func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { - switch c { - - case 3: - return processChunkG1Jacobian[bucketg1JacExtendedC3] - case 4: - return processChunkG1Jacobian[bucketg1JacExtendedC4] - case 5: - return processChunkG1Jacobian[bucketg1JacExtendedC5] - case 8: - return processChunkG1Jacobian[bucketg1JacExtendedC8] - case 11: - const batchSize = 150 - // here we could check some chunk statistic (deviation, ...) to determine if calling - // the batch affine version is worth it. - if stat.nbBucketFilled < batchSize { - // clear indicator that batch affine method is not appropriate here. - return processChunkG1Jacobian[bucketg1JacExtendedC11] - } - return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] - case 16: - const batchSize = 640 - // here we could check some chunk statistic (deviation, ...) to determine if calling - // the batch affine version is worth it. - if stat.nbBucketFilled < batchSize { - // clear indicator that batch affine method is not appropriate here. 
- return processChunkG1Jacobian[bucketg1JacExtendedC16] - } - return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - default: - // panic("will not happen c != previous values is not generated by templates") - return processChunkG1Jacobian[bucketg1JacExtendedC16] - } -} - -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { + // partition the scalars + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) nbChunks := computeNbChunks(c) @@ -188,7 +152,6 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -215,6 +178,43 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } +// getChunkProcessorG1 decides, depending on c window size and statistics for the chunk +// to return the best algorithm to process the chunk. +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { + switch c { + + case 3: + return processChunkG1Jacobian[bucketg1JacExtendedC3] + case 4: + return processChunkG1Jacobian[bucketg1JacExtendedC4] + case 5: + return processChunkG1Jacobian[bucketg1JacExtendedC5] + case 8: + return processChunkG1Jacobian[bucketg1JacExtendedC8] + case 11: + const batchSize = 150 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC11] + } + return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] + case 16: + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. 
+ return processChunkG1Jacobian[bucketg1JacExtendedC16] + } + return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + default: + // panic("will not happen c != previous values is not generated by templates") + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } +} + // msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { var _p g1JacExtended @@ -334,50 +334,14 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } - // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - _innerMsmG2(p, C, points, digits, chunkStats) + _innerMsmG2(p, C, points, scalars, config) return p, nil } -func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { - switch c { - - case 3: - return processChunkG2Jacobian[bucketg2JacExtendedC3] - case 4: - return processChunkG2Jacobian[bucketg2JacExtendedC4] - case 5: - return processChunkG2Jacobian[bucketg2JacExtendedC5] - case 8: - return processChunkG2Jacobian[bucketg2JacExtendedC8] - case 11: - const batchSize = 150 - // here we could check some chunk statistic (deviation, ...) to determine if calling - // the batch affine version is worth it. - if stat.nbBucketFilled < batchSize { - // clear indicator that batch affine method is not appropriate here. - return processChunkG2Jacobian[bucketg2JacExtendedC11] - } - return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] - case 16: - const batchSize = 640 - // here we could check some chunk statistic (deviation, ...) to determine if calling - // the batch affine version is worth it. - if stat.nbBucketFilled < batchSize { - // clear indicator that batch affine method is not appropriate here. - return processChunkG2Jacobian[bucketg2JacExtendedC16] - } - return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - default: - // panic("will not happen c != previous values is not generated by templates") - return processChunkG2Jacobian[bucketg2JacExtendedC16] - } -} - -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { + // partition the scalars + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) nbChunks := computeNbChunks(c) @@ -394,7 +358,6 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. 
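[editor's sketch] The loop that follows feeds one buffered channel per chunk, so producers never block on the reducer and the reducer can drain results most-significant chunk first. A miniature of that fan-in, with ints in place of g2JacExtended:

func collect(chChunks []chan int) []int {
    out := make([]int, len(chChunks))
    for j := len(chChunks) - 1; j >= 0; j-- {
        out[j] = <-chChunks[j] // reducer consumes the highest chunk first
    }
    return out
}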
n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) @@ -421,6 +384,43 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } +// getChunkProcessorG2 decides, depending on c window size and statistics for the chunk +// to return the best algorithm to process the chunk. +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { + switch c { + + case 3: + return processChunkG2Jacobian[bucketg2JacExtendedC3] + case 4: + return processChunkG2Jacobian[bucketg2JacExtendedC4] + case 5: + return processChunkG2Jacobian[bucketg2JacExtendedC5] + case 8: + return processChunkG2Jacobian[bucketg2JacExtendedC8] + case 11: + const batchSize = 150 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC11] + } + return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] + case 16: + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC16] + } + return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + default: + // panic("will not happen c != previous values is not generated by templates") + return processChunkG2Jacobian[bucketg2JacExtendedC16] + } +} + // msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { var _p g2JacExtended diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index e7244b2e97..f6d3a94ca9 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -21,6 +21,7 @@ import ( "math/big" "math/bits" "math/rand" + "runtime" "sync" "testing" "time" @@ -35,9 +36,9 @@ func TestMultiExpG1(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -125,9 +126,8 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -159,12 +159,14 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). 
Mul(&sampleScalars[i-1], &mixer). FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -222,32 +224,32 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
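
In a nutshell, the reason redundancy hurts: within one batch, the batch-affine processor can schedule at most one addition per bucket, because the shared batch inversion produces a single lambda per bucket update. When long runs of identical scalars keep hitting the same buckets, the queue of conflicted points fills up and forces frequent, small flushes. A toy model of the effect (purely illustrative; the generated processor tracks conflicts with a bitset and a bounded queue, not a map):

func countFlushesSketch(digits []uint16, batchSize int) int {
	flushes := 0
	inBatch := make(map[uint16]bool) // buckets already touched in the current batch
	for _, bucketID := range digits {
		if inBatch[bucketID] || len(inBatch) == batchSize {
			flushes++ // conflict or full batch: run one shared inversion and reset
			inBatch = make(map[uint16]bool)
		}
		inBatch[bucketID] = true
	}
	return flushes
}

The loop right below builds exactly such a distribution, runs of 100 identical scalars, so a scheduler like this one flushes after nearly every point inside a run instead of once per batchSize points.
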
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG1(samplePoints[:]) @@ -263,19 +265,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -354,9 +356,9 @@ func TestMultiExpG2(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -442,9 +444,8 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -476,12 +477,14 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -539,32 +542,32 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
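
The small-values scenario set up a few lines above is just as easy to see at the digit level: a scalar below 2^c has a non-zero digit only in its least significant window, so chunk 0 receives all of the bucket work while every other chunk only sees zero digits. A hypothetical helper (not part of the package) that makes the skew visible:

func digitsOfSmallScalar(v uint64, c, nbChunks uint) []uint64 {
	digits := make([]uint64, nbChunks)
	mask := uint64(1)<<c - 1
	for i := uint(0); i < nbChunks; i++ {
		digits[i] = (v >> (i * c)) & mask // for v < 2^c, only digits[0] can be non-zero
	}
	return digits
}

This is precisely the imbalance the chunkStats returned by partitionScalars is meant to surface, so that an overloaded chunk can be split across several goroutines.
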
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG2(samplePoints[:]) @@ -580,19 +583,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index 3f987c5bd2..5037c2e0be 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -128,52 +128,14 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } - // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - _innerMsmG1(p, C, points, digits, chunkStats) + _innerMsmG1(p, C, points, scalars, config) return p, nil } -func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { - switch c { - - case 2: - return processChunkG1Jacobian[bucketg1JacExtendedC2] - case 3: - return processChunkG1Jacobian[bucketg1JacExtendedC3] - case 4: - return processChunkG1Jacobian[bucketg1JacExtendedC4] - case 5: - return processChunkG1Jacobian[bucketg1JacExtendedC5] - case 8: - return processChunkG1Jacobian[bucketg1JacExtendedC8] - case 10: - const batchSize = 80 - // here we could check some chunk statistic (deviation, ...) to determine if calling - // the batch affine version is worth it. - if stat.nbBucketFilled < batchSize { - // clear indicator that batch affine method is not appropriate here. - return processChunkG1Jacobian[bucketg1JacExtendedC10] - } - return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] - case 16: - const batchSize = 640 - // here we could check some chunk statistic (deviation, ...) to determine if calling - // the batch affine version is worth it. - if stat.nbBucketFilled < batchSize { - // clear indicator that batch affine method is not appropriate here. 
-		return processChunkG1Jacobian[bucketg1JacExtendedC16]
-	}
-	return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16]
-	default:
-		// panic("will not happen c != previous values is not generated by templates")
-		return processChunkG1Jacobian[bucketg1JacExtendedC16]
-	}
-}
-
-func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac {
+func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac {
+	// partition the scalars
+	digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks)
 
 	nbChunks := computeNbChunks(c)
@@ -190,7 +152,6 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt
 	// the last chunk may be processed with a different method than the rest, as it could be smaller.
 	n := len(points)
 	for j := int(nbChunks - 1); j >= 0; j-- {
-		// fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled)
 		processChunk := getChunkProcessorG1(c, chunkStats[j])
 		if j == int(nbChunks-1) {
 			processChunk = getChunkProcessorG1(lastC(c), chunkStats[j])
@@ -217,6 +178,45 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt
 	return msmReduceChunkG1Affine(p, int(c), chChunks[:])
 }
 
+// getChunkProcessorG1 returns the best algorithm to process the chunk,
+// depending on the window size c and the statistics gathered for the chunk.
+func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) {
+	switch c {
+
+	case 2:
+		return processChunkG1Jacobian[bucketg1JacExtendedC2]
+	case 3:
+		return processChunkG1Jacobian[bucketg1JacExtendedC3]
+	case 4:
+		return processChunkG1Jacobian[bucketg1JacExtendedC4]
+	case 5:
+		return processChunkG1Jacobian[bucketg1JacExtendedC5]
+	case 8:
+		return processChunkG1Jacobian[bucketg1JacExtendedC8]
+	case 10:
+		const batchSize = 80
+		// here we could check some chunk statistic (deviation, ...) to determine if calling
+		// the batch affine version is worth it.
+		if stat.nbBucketFilled < batchSize {
+			// clear indicator that batch affine method is not appropriate here.
+			return processChunkG1Jacobian[bucketg1JacExtendedC10]
+		}
+		return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10]
+	case 16:
+		const batchSize = 640
+		// here we could check some chunk statistic (deviation, ...) to determine if calling
+		// the batch affine version is worth it.
+		if stat.nbBucketFilled < batchSize {
+			// clear indicator that batch affine method is not appropriate here.
+ return processChunkG1Jacobian[bucketg1JacExtendedC16] + } + return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + default: + // panic("will not happen c != previous values is not generated by templates") + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } +} + // msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { var _p g1JacExtended @@ -336,52 +336,14 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } - // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - _innerMsmG2(p, C, points, digits, chunkStats) + _innerMsmG2(p, C, points, scalars, config) return p, nil } -func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { - switch c { - - case 2: - return processChunkG2Jacobian[bucketg2JacExtendedC2] - case 3: - return processChunkG2Jacobian[bucketg2JacExtendedC3] - case 4: - return processChunkG2Jacobian[bucketg2JacExtendedC4] - case 5: - return processChunkG2Jacobian[bucketg2JacExtendedC5] - case 8: - return processChunkG2Jacobian[bucketg2JacExtendedC8] - case 10: - const batchSize = 80 - // here we could check some chunk statistic (deviation, ...) to determine if calling - // the batch affine version is worth it. - if stat.nbBucketFilled < batchSize { - // clear indicator that batch affine method is not appropriate here. - return processChunkG2Jacobian[bucketg2JacExtendedC10] - } - return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] - case 16: - const batchSize = 640 - // here we could check some chunk statistic (deviation, ...) to determine if calling - // the batch affine version is worth it. - if stat.nbBucketFilled < batchSize { - // clear indicator that batch affine method is not appropriate here. - return processChunkG2Jacobian[bucketg2JacExtendedC16] - } - return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - default: - // panic("will not happen c != previous values is not generated by templates") - return processChunkG2Jacobian[bucketg2JacExtendedC16] - } -} - -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { + // partition the scalars + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) nbChunks := computeNbChunks(c) @@ -398,7 +360,6 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. 
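
Once every chunk goroutine has reported on its channel, msmReduceChunkG2Affine folds the partial sums together. The recombination is the classic Pippenger reduction: chunk j carries weight 2^(c·j), so the accumulator starts from the most significant chunk and is doubled c times before each lower chunk is added in. A condensed sketch of that idea, borrowing the extended-Jacobian type and channel layout used throughout this patch (method names follow the package's existing conventions; this shows the shape of the reduction, not the generated body):

func reduceChunksSketch(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac {
	acc := <-chChunks[len(chChunks)-1] // most significant chunk first
	for j := len(chChunks) - 2; j >= 0; j-- {
		for l := 0; l < c; l++ {
			acc.double(&acc) // one doubling per bit of the window
		}
		t := <-chChunks[j]
		acc.add(&t)
	}
	return p.unsafeFromJacExtended(&acc)
}
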
n := len(points)
 	for j := int(nbChunks - 1); j >= 0; j-- {
-		// fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled)
 		processChunk := getChunkProcessorG2(c, chunkStats[j])
 		if j == int(nbChunks-1) {
 			processChunk = getChunkProcessorG2(lastC(c), chunkStats[j])
@@ -425,6 +386,45 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt
 	return msmReduceChunkG2Affine(p, int(c), chChunks[:])
 }
 
+// getChunkProcessorG2 returns the best algorithm to process the chunk,
+// depending on the window size c and the statistics gathered for the chunk.
+func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) {
+	switch c {
+
+	case 2:
+		return processChunkG2Jacobian[bucketg2JacExtendedC2]
+	case 3:
+		return processChunkG2Jacobian[bucketg2JacExtendedC3]
+	case 4:
+		return processChunkG2Jacobian[bucketg2JacExtendedC4]
+	case 5:
+		return processChunkG2Jacobian[bucketg2JacExtendedC5]
+	case 8:
+		return processChunkG2Jacobian[bucketg2JacExtendedC8]
+	case 10:
+		const batchSize = 80
+		// here we could check some chunk statistic (deviation, ...) to determine if calling
+		// the batch affine version is worth it.
+		if stat.nbBucketFilled < batchSize {
+			// clear indicator that batch affine method is not appropriate here.
+			return processChunkG2Jacobian[bucketg2JacExtendedC10]
+		}
+		return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10]
+	case 16:
+		const batchSize = 640
+		// here we could check some chunk statistic (deviation, ...) to determine if calling
+		// the batch affine version is worth it.
+		if stat.nbBucketFilled < batchSize {
+			// clear indicator that batch affine method is not appropriate here.
+ return processChunkG2Jacobian[bucketg2JacExtendedC16] + } + return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + default: + // panic("will not happen c != previous values is not generated by templates") + return processChunkG2Jacobian[bucketg2JacExtendedC16] + } +} + // msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { var _p g2JacExtended diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index 8354467f4b..83f5b41bbd 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -21,6 +21,7 @@ import ( "math/big" "math/bits" "math/rand" + "runtime" "sync" "testing" "time" @@ -35,9 +36,9 @@ func TestMultiExpG1(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -125,9 +126,8 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -159,12 +159,14 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -222,32 +224,32 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. 
- // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG1(samplePoints[:]) @@ -263,19 +265,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -354,9 +356,9 @@ func TestMultiExpG2(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -442,9 +444,8 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -476,12 +477,14 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -539,32 +542,32 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
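
With the three scalar distributions restored, the benchmark variants can be compared directly with the standard tooling, for instance:

go test -run='^$' -bench='BenchmarkMultiExpG2' -benchtime=10x ./ecc/bw6-761/

(-run='^$' skips the unit tests; the points-redundancy variant, whose input is built by the loop right below this note, is the stress test for the batch-affine conflict queue.)
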
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG2(samplePoints[:]) @@ -580,19 +583,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/go.mod b/go.mod index f1fd1fb56c..fe822ccf53 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,7 @@ require ( github.com/spf13/cobra v1.5.0 github.com/stretchr/testify v1.8.0 golang.org/x/crypto v0.0.0-20220722155217-630584e8d5aa - golang.org/x/sys v0.0.0-20220727055044-e65921a090b8 + golang.org/x/sys v0.2.0 ) require ( diff --git a/go.sum b/go.sum index a0175604ce..b3ae5f84f9 100644 --- a/go.sum +++ b/go.sum @@ -26,8 +26,8 @@ github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PK github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= golang.org/x/crypto v0.0.0-20220722155217-630584e8d5aa h1:zuSxTR4o9y82ebqCUJYNGJbGPo6sKVl54f/TVDObg1c= golang.org/x/crypto v0.0.0-20220722155217-630584e8d5aa/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= -golang.org/x/sys v0.0.0-20220727055044-e65921a090b8 h1:dyU22nBWzrmTQxtNrr4dzVOvaw35nUYE279vF9UmsI8= -golang.org/x/sys v0.0.0-20220727055044-e65921a090b8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.2.0 h1:ljd4t30dBnAvMZaQCevtY0xLLD0A+bRZXbgLMLU1F/A= +golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index c8a4e6b453..13e0171a3c 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -453,44 +453,15 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem } } - // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - _innerMsm{{ $.UPointName }}(p, C, points, digits, chunkStats) + _innerMsm{{ $.UPointName }}(p, C, points, scalars, config) return p, nil } +func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, scalars []fr.Element, config ecc.MultiExpConfig) *{{ $.TJacobian }} { + // partition the scalars + digits, chunkStats := 
partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks)
 
-
-func getChunkProcessor{{ $.UPointName }}(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, digits []uint16) {
-	switch c {
-	{{- range $c := $.LastCRange}}
-	case {{$c}}:
-		return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}]
-	{{- end }}
-	{{range $c := $.CRange}}
-	case {{$c}}:
-		{{- if le $c 9}}
-		return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}]
-		{{- else}}
-		const batchSize = {{batchSize $c}}
-		// here we could check some chunk statistic (deviation, ...) to determine if calling
-		// the batch affine version is worth it.
-		if stat.nbBucketFilled < batchSize {
-			// clear indicator that batch affine method is not appropriate here.
-			return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}]
-		}
-		return processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TJacobianExtended }}C{{$c}}, bucket{{ $.TAffine }}C{{$c}}, bitSetC{{$c}}, p{{$.TAffine}}C{{$c}}, pp{{$.TAffine}}C{{$c}}, q{{$.TAffine}}C{{$c}}, c{{$.TAffine}}C{{$c}}]
-		{{- end}}
-	{{- end}}
-	default:
-		// panic("will not happen c != previous values is not generated by templates")
-		return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C16]
-	}
-}
-
-func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, digits []uint16, chunkStats []chunkStat) *{{ $.TJacobian }} {
-
 	nbChunks := computeNbChunks(c)
 
 	// for each chunk, spawn one go routine that'll loop through all the scalars in the
@@ -506,7 +477,6 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T
 	// the last chunk may be processed with a different method than the rest, as it could be smaller.
 	n := len(points)
 	for j := int(nbChunks - 1); j >= 0; j-- {
-		// fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled)
 		processChunk := getChunkProcessor{{ $.UPointName }}(c, chunkStats[j])
 		if j == int(nbChunks - 1) {
 			processChunk = getChunkProcessor{{ $.UPointName }}(lastC(c), chunkStats[j])
@@ -534,6 +504,35 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T
 
 }
 
+// getChunkProcessor{{ $.UPointName }} returns the best algorithm to process the chunk,
+// depending on the window size c and the statistics gathered for the chunk.
+func getChunkProcessor{{ $.UPointName }}(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, digits []uint16) {
+	switch c {
+	{{- range $c := $.LastCRange}}
+	case {{$c}}:
+		return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}]
+	{{- end }}
+	{{range $c := $.CRange}}
+	case {{$c}}:
+		{{- if le $c 9}}
+		return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}]
+		{{- else}}
+		const batchSize = {{batchSize $c}}
+		// here we could check some chunk statistic (deviation, ...) to determine if calling
+		// the batch affine version is worth it.
+		if stat.nbBucketFilled < batchSize {
+			// clear indicator that batch affine method is not appropriate here.
+ return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}] + } + return processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TJacobianExtended }}C{{$c}}, bucket{{ $.TAffine }}C{{$c}}, bitSetC{{$c}}, p{{$.TAffine}}C{{$c}}, pp{{$.TAffine}}C{{$c}}, q{{$.TAffine}}C{{$c}}, c{{$.TAffine}}C{{$c}}] + {{- end}} + {{- end}} + default: + // panic("will not happen c != previous values is not generated by templates") + return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C16] + } +} + // msmReduceChunk{{ $.TAffine }} reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunk{{ $.TAffine }}(p *{{ $.TJacobian }}, c int, chChunks []chan {{ $.TJacobianExtended }}) *{{ $.TJacobian }} { diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index dcba38d621..5079d33d3b 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -10,6 +10,7 @@ import ( "fmt" "time" + "runtime" "math/rand" "math/big" "testing" @@ -23,8 +24,8 @@ import ( ) -{{template "multiexp" dict "PointName" .G1.PointName "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange}} -{{template "multiexp" dict "PointName" .G2.PointName "TAffine" $G2TAffine "TJacobian" $G2TJacobian "TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G2.CRange}} +{{template "multiexp" dict "PointName" .G1.PointName "UPointName" (toUpper .G1.PointName) "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange}} +{{template "multiexp" dict "PointName" .G2.PointName "UPointName" (toUpper .G2.PointName) "TAffine" $G2TAffine "TJacobian" $G2TJacobian "TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G2.CRange}} {{define "multiexp" }} @@ -32,9 +33,9 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } @@ -100,7 +101,7 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { // cRange is generated from template and contains the available parameters for the multiexp window size {{- if eq $.PointName "g1" }} cRange := []uint64{ - {{- range $c := $.CRange}} {{- if and (eq $.PointName "g1") (gt $c 21)}}{{- else}} {{$c}},{{- end}}{{- end}} + {{- range $c := $.CRange}}{{$c}},{{- end}} } if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) @@ -133,9 +134,8 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { results := make([]{{ $.TJacobian }}, len(cRange)) - for i, _ := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp( samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsm{{ $.UPointName }}(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i:=1; i < len(results);i++ { if !results[i].Equal(&results[i-1]) { @@ -167,12 +167,14 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() + if i % 10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]{{ $.TJacobian }}, len(cRange)) - for i, _ := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsm{{ $.UPointName }}(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -237,30 +239,30 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { var ( samplePoints [nbSamples]{{ $.TAffine }} sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:],sampleScalars[:]) + copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i:=0; i < len(sampleScalarsSmallValues);i++ { + if i % 5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
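
The batchSize thresholds this template wires into getChunkProcessor* (80, 150 or 640 in the generated files above, via {{batchSize $c}}) encode a simple amortization argument: batch-affine replaces the one field inversion a plain affine addition would need with a single inversion shared by the whole batch, so it only wins once a chunk fills enough distinct buckets between flushes. A deliberately simple sketch of the resulting rule (illustrative; the actual constants were presumably tuned with benchmarks like the ones restored here):

func useBatchAffineSketch(nbBucketFilled, batchSize int) bool {
	// with fewer filled buckets than one batch, the shared inversion and the
	// queue bookkeeping cannot be amortized: extended Jacobian stays cheaper
	return nbBucketFilled >= batchSize
}
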
+ for i:=0; i < len(sampleScalarsRedundant);i+=100 { + for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBases{{ toUpper $.PointName }}(samplePoints[:]) @@ -277,19 +279,19 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + } + }) } } From 2a8d8e6d7b27382dd3d66cc18d2bbfa2e612892f Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 16 Nov 2022 14:41:28 -0600 Subject: [PATCH 34/43] test: restore bench batchadd --- ecc/bls12-377/g1_test.go | 41 +++++++++--------- ecc/bls12-377/g2_test.go | 41 +++++++++--------- ecc/bls12-378/g1_test.go | 41 +++++++++--------- ecc/bls12-378/g2_test.go | 41 +++++++++--------- ecc/bls12-381/g1_test.go | 41 +++++++++--------- ecc/bls12-381/g2_test.go | 41 +++++++++--------- ecc/bls24-315/g1_test.go | 41 +++++++++--------- ecc/bls24-315/g2_test.go | 41 +++++++++--------- ecc/bls24-317/g1_test.go | 41 +++++++++--------- ecc/bls24-317/g2_test.go | 41 +++++++++--------- ecc/bn254/g1_test.go | 41 +++++++++--------- ecc/bn254/g2_test.go | 41 +++++++++--------- ecc/bw6-633/g1_test.go | 41 +++++++++--------- ecc/bw6-633/g2_test.go | 41 +++++++++--------- ecc/bw6-756/g1_test.go | 41 +++++++++--------- ecc/bw6-756/g2_test.go | 41 +++++++++--------- ecc/bw6-761/g1_test.go | 41 +++++++++--------- ecc/bw6-761/g2_test.go | 41 +++++++++--------- .../ecc/template/tests/point.go.tmpl | 43 ++++++++++--------- 19 files changed, 400 insertions(+), 381 deletions(-) diff --git a/ecc/bls12-377/g1_test.go b/ecc/bls12-377/g1_test.go index eb09d3cca4..0dffff841c 100644 --- a/ecc/bls12-377/g1_test.go +++ b/ecc/bls12-377/g1_test.go @@ -19,6 +19,7 @@ package bls12377 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-377/fp" @@ -499,32 +500,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG1Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G1Affine -// var RR, PP [MAX_BATCH_SIZE]*G1Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG1Affine(b *testing.B) { -// fillBenchBasesG1(P[:]) -// fillBenchBasesG1(R[:]) + var P, R pG1AffineC16 + var RR ppG1AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random 
permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG1Affine[pG1AffineC16, ppG1AffineC16, cG1AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls12-377/g2_test.go b/ecc/bls12-377/g2_test.go index c0653c32af..25bcb60ee4 100644 --- a/ecc/bls12-377/g2_test.go +++ b/ecc/bls12-377/g2_test.go @@ -19,6 +19,7 @@ package bls12377 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-377/internal/fptower" @@ -505,32 +506,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG2Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G2Affine -// var RR, PP [MAX_BATCH_SIZE]*G2Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG2Affine(b *testing.B) { -// fillBenchBasesG2(P[:]) -// fillBenchBasesG2(R[:]) + var P, R pG2AffineC16 + var RR ppG2AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG2Affine[pG2AffineC16, ppG2AffineC16, cG2AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls12-378/g1_test.go b/ecc/bls12-378/g1_test.go index 6752818d29..70c9dc5603 100644 --- a/ecc/bls12-378/g1_test.go +++ b/ecc/bls12-378/g1_test.go @@ -19,6 +19,7 @@ package bls12378 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-378/fp" @@ -499,32 +500,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG1Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G1Affine -// var RR, PP [MAX_BATCH_SIZE]*G1Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG1Affine(b *testing.B) { -// fillBenchBasesG1(P[:]) -// fillBenchBasesG1(R[:]) + var P, R pG1AffineC16 + var RR ppG1AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + 
b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG1Affine[pG1AffineC16, ppG1AffineC16, cG1AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls12-378/g2_test.go b/ecc/bls12-378/g2_test.go index a9632dc413..ef146f5247 100644 --- a/ecc/bls12-378/g2_test.go +++ b/ecc/bls12-378/g2_test.go @@ -19,6 +19,7 @@ package bls12378 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-378/internal/fptower" @@ -505,32 +506,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG2Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G2Affine -// var RR, PP [MAX_BATCH_SIZE]*G2Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG2Affine(b *testing.B) { -// fillBenchBasesG2(P[:]) -// fillBenchBasesG2(R[:]) + var P, R pG2AffineC16 + var RR ppG2AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG2Affine[pG2AffineC16, ppG2AffineC16, cG2AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls12-381/g1_test.go b/ecc/bls12-381/g1_test.go index 223c3763c0..ce531be7ae 100644 --- a/ecc/bls12-381/g1_test.go +++ b/ecc/bls12-381/g1_test.go @@ -19,6 +19,7 @@ package bls12381 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-381/fp" @@ -499,32 +500,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG1Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G1Affine -// var RR, PP [MAX_BATCH_SIZE]*G1Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG1Affine(b *testing.B) { -// fillBenchBasesG1(P[:]) -// fillBenchBasesG1(R[:]) + var P, R pG1AffineC16 + var RR ppG1AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG1Affine[pG1AffineC16, ppG1AffineC16, cG1AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls12-381/g2_test.go 
b/ecc/bls12-381/g2_test.go index be4957738e..27cccc9938 100644 --- a/ecc/bls12-381/g2_test.go +++ b/ecc/bls12-381/g2_test.go @@ -19,6 +19,7 @@ package bls12381 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-381/internal/fptower" @@ -505,32 +506,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG2Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G2Affine -// var RR, PP [MAX_BATCH_SIZE]*G2Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG2Affine(b *testing.B) { -// fillBenchBasesG2(P[:]) -// fillBenchBasesG2(R[:]) + var P, R pG2AffineC16 + var RR ppG2AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG2Affine[pG2AffineC16, ppG2AffineC16, cG2AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls24-315/g1_test.go b/ecc/bls24-315/g1_test.go index 4ffe3679c7..27054cb2fa 100644 --- a/ecc/bls24-315/g1_test.go +++ b/ecc/bls24-315/g1_test.go @@ -19,6 +19,7 @@ package bls24315 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls24-315/fp" @@ -499,32 +500,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG1Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G1Affine -// var RR, PP [MAX_BATCH_SIZE]*G1Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG1Affine(b *testing.B) { -// fillBenchBasesG1(P[:]) -// fillBenchBasesG1(R[:]) + var P, R pG1AffineC16 + var RR ppG1AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG1Affine[pG1AffineC16, ppG1AffineC16, cG1AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls24-315/g2_test.go b/ecc/bls24-315/g2_test.go index 019fa5ec24..d923c5a007 100644 --- a/ecc/bls24-315/g2_test.go +++ b/ecc/bls24-315/g2_test.go @@ -19,6 +19,7 @@ package bls24315 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls24-315/internal/fptower" @@ 
-505,32 +506,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG2Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G2Affine -// var RR, PP [MAX_BATCH_SIZE]*G2Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG2Affine(b *testing.B) { -// fillBenchBasesG2(P[:]) -// fillBenchBasesG2(R[:]) + var P, R pG2AffineC16 + var RR ppG2AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG2Affine[pG2AffineC16, ppG2AffineC16, cG2AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls24-317/g1_test.go b/ecc/bls24-317/g1_test.go index 3a89f924e5..fc26138b40 100644 --- a/ecc/bls24-317/g1_test.go +++ b/ecc/bls24-317/g1_test.go @@ -19,6 +19,7 @@ package bls24317 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls24-317/fp" @@ -499,32 +500,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG1Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G1Affine -// var RR, PP [MAX_BATCH_SIZE]*G1Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG1Affine(b *testing.B) { -// fillBenchBasesG1(P[:]) -// fillBenchBasesG1(R[:]) + var P, R pG1AffineC16 + var RR ppG1AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG1Affine[pG1AffineC16, ppG1AffineC16, cG1AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls24-317/g2_test.go b/ecc/bls24-317/g2_test.go index 1d7ed1f3ff..8068d382f0 100644 --- a/ecc/bls24-317/g2_test.go +++ b/ecc/bls24-317/g2_test.go @@ -19,6 +19,7 @@ package bls24317 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls24-317/internal/fptower" @@ -505,32 +506,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG2Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G2Affine -// var RR, PP [MAX_BATCH_SIZE]*G2Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG2Affine(b *testing.B) { 
-// fillBenchBasesG2(P[:]) -// fillBenchBasesG2(R[:]) + var P, R pG2AffineC16 + var RR ppG2AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG2Affine[pG2AffineC16, ppG2AffineC16, cG2AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bn254/g1_test.go b/ecc/bn254/g1_test.go index 2e1973a911..c62747066a 100644 --- a/ecc/bn254/g1_test.go +++ b/ecc/bn254/g1_test.go @@ -19,6 +19,7 @@ package bn254 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bn254/fp" @@ -460,32 +461,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG1Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G1Affine -// var RR, PP [MAX_BATCH_SIZE]*G1Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG1Affine(b *testing.B) { -// fillBenchBasesG1(P[:]) -// fillBenchBasesG1(R[:]) + var P, R pG1AffineC16 + var RR ppG1AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG1Affine[pG1AffineC16, ppG1AffineC16, cG1AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bn254/g2_test.go b/ecc/bn254/g2_test.go index ae107fea78..5b103f2c4f 100644 --- a/ecc/bn254/g2_test.go +++ b/ecc/bn254/g2_test.go @@ -19,6 +19,7 @@ package bn254 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bn254/internal/fptower" @@ -504,32 +505,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG2Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G2Affine -// var RR, PP [MAX_BATCH_SIZE]*G2Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG2Affine(b *testing.B) { -// fillBenchBasesG2(P[:]) -// fillBenchBasesG2(R[:]) + var P, R pG2AffineC16 + var RR ppG2AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) -// // random permute -// 
rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG2Affine[pG2AffineC16, ppG2AffineC16, cG2AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bw6-633/g1_test.go b/ecc/bw6-633/g1_test.go index a2b6c273f6..afa183ba27 100644 --- a/ecc/bw6-633/g1_test.go +++ b/ecc/bw6-633/g1_test.go @@ -19,6 +19,7 @@ package bw6633 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-633/fp" @@ -499,32 +500,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG1Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G1Affine -// var RR, PP [MAX_BATCH_SIZE]*G1Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG1Affine(b *testing.B) { -// fillBenchBasesG1(P[:]) -// fillBenchBasesG1(R[:]) + var P, R pG1AffineC16 + var RR ppG1AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG1Affine[pG1AffineC16, ppG1AffineC16, cG1AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bw6-633/g2_test.go b/ecc/bw6-633/g2_test.go index f5c4d5edca..cb8469886c 100644 --- a/ecc/bw6-633/g2_test.go +++ b/ecc/bw6-633/g2_test.go @@ -19,6 +19,7 @@ package bw6633 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-633/fp" @@ -486,32 +487,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG2Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G2Affine -// var RR, PP [MAX_BATCH_SIZE]*G2Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG2Affine(b *testing.B) { -// fillBenchBasesG2(P[:]) -// fillBenchBasesG2(R[:]) + var P, R pG2AffineC16 + var RR ppG2AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// 
b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG2Affine[pG2AffineC16, ppG2AffineC16, cG2AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bw6-756/g1_test.go b/ecc/bw6-756/g1_test.go index bd7a65f693..729fc7dfd6 100644 --- a/ecc/bw6-756/g1_test.go +++ b/ecc/bw6-756/g1_test.go @@ -19,6 +19,7 @@ package bw6756 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-756/fp" @@ -499,32 +500,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG1Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G1Affine -// var RR, PP [MAX_BATCH_SIZE]*G1Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG1Affine(b *testing.B) { -// fillBenchBasesG1(P[:]) -// fillBenchBasesG1(R[:]) + var P, R pG1AffineC16 + var RR ppG1AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG1Affine[pG1AffineC16, ppG1AffineC16, cG1AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bw6-756/g2_test.go b/ecc/bw6-756/g2_test.go index 7d98c06668..95bd4e2312 100644 --- a/ecc/bw6-756/g2_test.go +++ b/ecc/bw6-756/g2_test.go @@ -19,6 +19,7 @@ package bw6756 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-756/fp" @@ -486,32 +487,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG2Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G2Affine -// var RR, PP [MAX_BATCH_SIZE]*G2Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG2Affine(b *testing.B) { -// fillBenchBasesG2(P[:]) -// fillBenchBasesG2(R[:]) + var P, R pG2AffineC16 + var RR ppG2AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG2Affine[pG2AffineC16, ppG2AffineC16, cG2AffineC16](&RR, &P, len(P)) + } +} func 
BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bw6-761/g1_test.go b/ecc/bw6-761/g1_test.go index 4cbc725f60..f679f9719c 100644 --- a/ecc/bw6-761/g1_test.go +++ b/ecc/bw6-761/g1_test.go @@ -19,6 +19,7 @@ package bw6761 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-761/fp" @@ -499,32 +500,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG1Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G1Affine -// var RR, PP [MAX_BATCH_SIZE]*G1Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG1Affine(b *testing.B) { -// fillBenchBasesG1(P[:]) -// fillBenchBasesG1(R[:]) + var P, R pG1AffineC16 + var RR ppG1AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG1Affine[pG1AffineC16, ppG1AffineC16, cG1AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bw6-761/g2_test.go b/ecc/bw6-761/g2_test.go index 7fa415d6a5..a0bd87be5e 100644 --- a/ecc/bw6-761/g2_test.go +++ b/ecc/bw6-761/g2_test.go @@ -19,6 +19,7 @@ package bw6761 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-761/fp" @@ -486,32 +487,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG2Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G2Affine -// var RR, PP [MAX_BATCH_SIZE]*G2Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG2Affine(b *testing.B) { -// fillBenchBasesG2(P[:]) -// fillBenchBasesG2(R[:]) + var P, R pG2AffineC16 + var RR ppG2AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG2Affine[pG2AffineC16, ppG2AffineC16, cG2AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/internal/generator/ecc/template/tests/point.go.tmpl b/internal/generator/ecc/template/tests/point.go.tmpl index 223bfbe040..1f342412b8 100644 --- 
a/internal/generator/ecc/template/tests/point.go.tmpl +++ b/internal/generator/ecc/template/tests/point.go.tmpl @@ -16,6 +16,7 @@ import ( "fmt" "math/big" "testing" + "math/rand" {{if or (eq .CoordType "fptower.E2") (eq .CoordType "fptower.E4")}} "github.com/consensys/gnark-crypto/ecc/{{.Name}}/internal/fptower" @@ -559,32 +560,32 @@ func Benchmark{{ $TJacobian }}IsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAdd{{ $TAffine }}(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]{{ $TAffine }} -// var RR, PP [MAX_BATCH_SIZE]*{{ $TAffine }} -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAdd{{ $TAffine }}(b *testing.B) { + {{$c := 16}} + var P, R p{{$TAffine}}C{{$c}} + var RR pp{{$TAffine}}C{{$c}} + ridx := make([]int, len(P)) -// fillBenchBases{{ toUpper $.PointName }}(P[:]) -// fillBenchBases{{ toUpper $.PointName }}(R[:]) + // TODO P == R may produce skewed benches + fillBenchBases{{ toUpper $.PointName }}(P[:]) + fillBenchBases{{ toUpper $.PointName }}(R[:]) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } - -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i:=0; i < len(ridx);i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAdd{{ $TAffine }}(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAdd{{ $TAffine }}[p{{$TAffine}}C{{$c}}, pp{{$TAffine}}C{{$c}}, c{{$TAffine}}C{{$c}}](&RR, &P, len(P)) + } +} func Benchmark{{ $TAffine }}BatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled From 6049e2ff393a914d1ba1e3377c9a5b200d3c3835 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 16 Nov 2022 15:13:05 -0600 Subject: [PATCH 35/43] bug: bug when c==1 msm ext jac incorrect --- ecc/bls12-377/multiexp.go | 4 ++++ ecc/bls12-377/multiexp_test.go | 6 +++++- ecc/bls12-378/multiexp.go | 4 ++++ ecc/bls12-378/multiexp_test.go | 4 ++++ ecc/bls12-381/multiexp.go | 4 ++++ ecc/bls12-381/multiexp_test.go | 6 +++++- ecc/bls24-315/multiexp.go | 4 ++++ ecc/bls24-315/multiexp_test.go | 6 +++++- ecc/bls24-317/multiexp.go | 4 ++++ ecc/bls24-317/multiexp_test.go | 6 +++++- ecc/bn254/multiexp.go | 4 ++++ ecc/bn254/multiexp_test.go | 4 ++++ ecc/bw6-633/multiexp.go | 4 ++++ ecc/bw6-633/multiexp_test.go | 6 +++++- ecc/bw6-756/multiexp.go | 4 ++++ ecc/bw6-756/multiexp_test.go | 4 ++++ ecc/bw6-761/multiexp.go | 4 ++++ ecc/bw6-761/multiexp_test.go | 4 ++++ internal/generator/ecc/template/multiexp.go.tmpl | 2 ++ internal/generator/ecc/template/tests/multiexp.go.tmpl | 4 +++- 20 files changed, 82 insertions(+), 6 deletions(-) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index 7e519245cc..00f3a97050 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -275,12 +275,14 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? 
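For context on the hunk above: msmReduceChunk folds the per-chunk partial sums with a Horner scheme. If the scalars decompose as s = sum_j d_j * 2^(c*j), then starting from the highest chunk and repeating "double c times, then add the next chunk sum" reconstructs the full result from the chunk results. The same fold, sketched on integers rather than curve points:

	import "math/big"

	// Integer model of the reduction above: chunks holds the per-chunk
	// values, lowest chunk first; each Lsh by c stands for the c group
	// doublings, each Add for folding in the next chunk sum.
	func reduceChunks(chunks []*big.Int, c uint) *big.Int {
		total := new(big.Int).Set(chunks[len(chunks)-1])
		for j := len(chunks) - 2; j >= 0; j-- {
			total.Lsh(total, c)
			total.Add(total, chunks[j])
		}
		return total
	}

The TODOs flag the degenerate case: the fold is only correct on all-zero inputs if doubling and converting the point at infinity stay well-defined.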
return p.unsafeFromJacExtended(&_p) } @@ -534,12 +536,14 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 4510933ea1..1eb820de13 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -100,7 +100,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + cRange := []uint64{2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -131,6 +131,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -170,6 +171,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -449,6 +451,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -488,6 +491,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 63476d87c9..133242f8af 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -275,12 +275,14 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } @@ -534,12 +536,14 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? 
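A quick sanity check of that fold, with c = 2 and the scalar 13 = 0b1101, whose 2-bit chunks (lowest first) are 0b01 = 1 and 0b11 = 3:

	// Usage of the reduceChunks sketch (with "fmt" and "math/big" imported):
	// 13 = 3*2^2 + 1, and the fold computes total = 3, then 3<<2 = 12,
	// then 12 + 1 = 13.
	fmt.Println(reduceChunks([]*big.Int{big.NewInt(1), big.NewInt(3)}, 2)) // 13

Note that the production code uses signed window digits rather than the plain bit slices shown here; c = 1 is a degenerate window under that scheme, and rather than supporting it this patch drops 1 from the tested cRange.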
return p.unsafeFromJacExtended(&_p) } diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index 5bef450dc0..ab0f36f2ec 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -131,6 +131,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -170,6 +171,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -449,6 +451,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -488,6 +491,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index 4a389f9e4f..60e4686759 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -275,12 +275,14 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } @@ -534,12 +536,14 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? 
return p.unsafeFromJacExtended(&_p) } diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index fdd7809b56..aa5814cf6e 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -100,7 +100,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + cRange := []uint64{3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -131,6 +131,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -170,6 +171,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -449,6 +451,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -488,6 +491,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 9b3b5eb0d8..85c2a14d17 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -275,12 +275,14 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } @@ -534,12 +536,14 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? 
return p.unsafeFromJacExtended(&_p) } diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index bdb2e0167a..9787f4f172 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -100,7 +100,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + cRange := []uint64{2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -131,6 +131,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -170,6 +171,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -449,6 +451,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -488,6 +491,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index 6b98c52875..733358396c 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -275,12 +275,14 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } @@ -534,12 +536,14 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? 
return p.unsafeFromJacExtended(&_p) } diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index 48420037d0..54fbbb031f 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -100,7 +100,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + cRange := []uint64{3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -131,6 +131,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -170,6 +171,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -449,6 +451,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -488,6 +491,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index 791fc0c19e..d373d1683e 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -275,12 +275,14 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } @@ -534,12 +536,14 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? 
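Since the TODO recurs in every generated copy, here is the shape a defensive fix could take, assuming the extended-Jacobian type marks infinity with ZZ == 0 and that a package-level g1Infinity value exists (both are assumptions about the curve packages, not confirmed here):

	// Hypothetical guard for the conversion below: stay well-defined even
	// when the accumulated extended-Jacobian point is the point at infinity.
	func safeFromJacExtended(p *G1Jac, q *g1JacExtended) *G1Jac {
		if q.ZZ.IsZero() { // assumed infinity marker of g1JacExtended
			p.Set(&g1Infinity) // assumed package-level infinity constant
			return p
		}
		return p.unsafeFromJacExtended(q)
	}

Whether double() already tolerates the zero case decides if the in-loop TODO needs the same guard; that question is left open here, as in the TODOs themselves.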
return p.unsafeFromJacExtended(&_p) } diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index 00cd01348e..a6cf0fbe50 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -131,6 +131,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -170,6 +171,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -449,6 +451,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -488,6 +491,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index eeb7d4e43a..dbbd344b08 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -222,12 +222,14 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } @@ -428,12 +430,14 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? 
return p.unsafeFromJacExtended(&_p) } diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 6572865155..8e5c73a1e6 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -100,7 +100,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{1, 4, 5, 8, 12, 16} + cRange := []uint64{4, 5, 8, 12, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -131,6 +131,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -170,6 +171,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -449,6 +451,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -488,6 +491,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index 35135f2959..83b43d9a33 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -222,12 +222,14 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } @@ -428,12 +430,14 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? 
return p.unsafeFromJacExtended(&_p) } diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index f6d3a94ca9..3daeb64fca 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -131,6 +131,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -170,6 +171,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -449,6 +451,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -488,6 +491,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index 5037c2e0be..a0d2a19620 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -224,12 +224,14 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } @@ -432,12 +434,14 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? 
return p.unsafeFromJacExtended(&_p) } diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index 83f5b41bbd..8520cc7435 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -131,6 +131,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -170,6 +171,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -449,6 +451,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -488,6 +491,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index 13e0171a3c..5f3d40e9b5 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -541,12 +541,14 @@ func msmReduceChunk{{ $.TAffine }}(p *{{ $.TJacobian }}, c int, chChunks []chan _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index 5079d33d3b..08c7acd6ff 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -101,7 +101,7 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { // cRange is generated from template and contains the available parameters for the multiexp window size {{- if eq $.PointName "g1" }} cRange := []uint64{ - {{- range $c := $.CRange}}{{$c}},{{- end}} + {{- range $c := $.CRange}}{{- if gt $c 1}}{{$c}},{{- end}}{{- end}} } if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) @@ -139,6 +139,7 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { } for i:=1; i < len(results);i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1],cRange[i]) return false } } @@ -178,6 +179,7 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1],cRange[i]) return false } } From 3133efdc7178e9f8f898a4a268d82c65f39e7387 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 16 Nov 2022 15:33:08 -0600 Subject: [PATCH 36/43] test: added cross msm tests --- ecc/bls12-377/multiexp_test.go | 210 ++++++++++++++++-- ecc/bls12-378/multiexp_test.go | 210 ++++++++++++++++-- ecc/bls12-381/multiexp_test.go | 210 ++++++++++++++++-- ecc/bls24-315/multiexp_test.go | 210 ++++++++++++++++-- ecc/bls24-317/multiexp_test.go | 210 ++++++++++++++++-- ecc/bn254/multiexp_test.go | 210 ++++++++++++++++-- ecc/bw6-633/multiexp_test.go | 210 ++++++++++++++++-- ecc/bw6-756/multiexp_test.go | 210 ++++++++++++++++-- 
ecc/bw6-761/multiexp_test.go | 210 ++++++++++++++++-- .../ecc/template/tests/multiexp.go.tmpl | 140 ++++++++++-- 10 files changed, 1885 insertions(+), 145 deletions(-) diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 1eb820de13..f2487e2edc 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -144,7 +144,6 @@ func TestMultiExpG1(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G1Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G1Jac @@ -160,18 +159,36 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G1Jac, len(cRange)) for i, c := range cRange { _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G1] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -218,6 +235,81 @@ func TestMultiExpG1(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG1(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + cRange := []uint64{2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + if testing.Short() { + // test only "odd" and "even" (ie windows size divide word size vs not) + cRange = []uint64{5, 16} + } + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G1Jac + _innerMsmG1Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G1Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG1Reference always do ext jacobian with c == 16 
+func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g1JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG1Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG1(b *testing.B) { const ( @@ -464,7 +556,6 @@ func TestMultiExpG2(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G2Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G2Jac @@ -480,18 +571,36 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G2Jac, len(cRange)) for i, c := range cRange { _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G2] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -538,6 +647,79 @@ func TestMultiExpG2(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG2(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + // for g2, CI suffers with large c size since it needs to 
allocate a lot of memory for the buckets. + // test only "odd" and "even" (ie windows size divide word size vs not) + cRange := []uint64{5, 16} + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G2Jac + _innerMsmG2Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G2Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG2Reference always do ext jacobian with c == 16 +func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG2Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG2(b *testing.B) { const ( diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index ab0f36f2ec..55524da71e 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -144,7 +144,6 @@ func TestMultiExpG1(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G1Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G1Jac @@ -160,18 +159,36 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G1Jac, len(cRange)) for i, c := range cRange { _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G1] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -218,6 +235,81 @@ func TestMultiExpG1(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG1(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + cRange := []uint64{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + if testing.Short() { + // test only "odd" and "even" (ie windows size divide word size vs not) + cRange = []uint64{5, 16} + } + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G1Jac + _innerMsmG1Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G1Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG1Reference always do ext jacobian with c == 16 +func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g1JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // 
the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG1Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG1(b *testing.B) { const ( @@ -464,7 +556,6 @@ func TestMultiExpG2(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G2Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G2Jac @@ -480,18 +571,36 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G2Jac, len(cRange)) for i, c := range cRange { _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G2] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -538,6 +647,79 @@ func TestMultiExpG2(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG2(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. 
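The CI comment above is about the per-chunk bucket arrays: with signed digits there are 2^(c-1) buckets of extended-Jacobian points per processing goroutine. A rough estimate, assuming X, Y, ZZ, ZZZ coordinates over a 384-bit base field (numbers are illustrative, not curve-exact):

	// Rough per-chunk bucket footprint behind the comment above.
	func g2BucketBytes(c uint) uint {
		const fpBytes = 48             // assumed 384-bit base field element
		const e2Bytes = 2 * fpBytes    // E2 = (a0, a1) over fp
		const pointBytes = 4 * e2Bytes // X, Y, ZZ, ZZZ
		return (1 << (c - 1)) * pointBytes
	}

	// g2BucketBytes(16) is about 12 MiB per chunk goroutine versus about
	// 6 KiB for c = 5, which is why the G2 cross test limits itself to the
	// two representative sizes {5, 16} instead of the full cRange.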
+ // test only "odd" and "even" (ie windows size divide word size vs not) + cRange := []uint64{5, 16} + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G2Jac + _innerMsmG2Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G2Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG2Reference always do ext jacobian with c == 16 +func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG2Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG2(b *testing.B) { const ( diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index aa5814cf6e..8d96b5c59e 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -144,7 +144,6 @@ func TestMultiExpG1(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G1Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G1Jac @@ -160,18 +159,36 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G1Jac, len(cRange)) for i, c := range cRange { _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G1] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -218,6 +235,81 @@ func TestMultiExpG1(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG1(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + cRange := []uint64{3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + if testing.Short() { + // test only "odd" and "even" (ie windows size divide word size vs not) + cRange = []uint64{5, 16} + } + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G1Jac + _innerMsmG1Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G1Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG1Reference always do ext jacobian with c == 16 +func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g1JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // the 
last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG1Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG1(b *testing.B) { const ( @@ -464,7 +556,6 @@ func TestMultiExpG2(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G2Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G2Jac @@ -480,18 +571,36 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G2Jac, len(cRange)) for i, c := range cRange { _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G2] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -538,6 +647,79 @@ func TestMultiExpG2(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG2(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. 
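For intuition on the digits slicing used above (digits[j*n:(j+1)*n] hands chunk j exactly one digit per scalar, chunk-major), here is a minimal sketch of the base-2^c decomposition that partitionScalars conceptually performs. The helper name decompose is hypothetical, and the sketch is simplified to unsigned digits; the real partitionScalars also uses signed digits and handles Montgomery form.

	package main

	import (
		"fmt"
		"math/big"
	)

	// decompose splits s into ceil(bits/c) base-2^c digits, least significant first.
	// Simplified sketch: the library's partitionScalars uses signed digits to
	// halve the bucket count, which this toy version does not do.
	func decompose(s *big.Int, c uint, bits int) []uint64 {
		nbChunks := (bits + int(c) - 1) / int(c)
		mask := new(big.Int).SetUint64((uint64(1) << c) - 1)
		digits := make([]uint64, nbChunks)
		t := new(big.Int).Set(s)
		for j := 0; j < nbChunks; j++ {
			var d big.Int
			d.And(t, mask) // low c bits = digit j
			digits[j] = d.Uint64()
			t.Rsh(t, c) // move to the next bit-window
		}
		return digits
	}

	func main() {
		s, _ := new(big.Int).SetString("123456789abcdef0", 16)
		fmt.Println(decompose(s, 16, 64)) // four 16-bit digits, little-endian
	}
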
+ // test only "odd" and "even" (ie windows size divide word size vs not) + cRange := []uint64{5, 16} + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G2Jac + _innerMsmG2Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G2Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG2Reference always do ext jacobian with c == 16 +func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG2Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG2(b *testing.B) { const ( diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index 9787f4f172..4e67c67761 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -144,7 +144,6 @@ func TestMultiExpG1(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G1Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G1Jac @@ -160,18 +159,36 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G1Jac, len(cRange)) for i, c := range cRange { _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G1] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -218,6 +235,81 @@ func TestMultiExpG1(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG1(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + cRange := []uint64{2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + if testing.Short() { + // test only "odd" and "even" (ie windows size divide word size vs not) + cRange = []uint64{5, 16} + } + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G1Jac + _innerMsmG1Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G1Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG1Reference always do ext jacobian with c == 16 +func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g1JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // the 
last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG1Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG1(b *testing.B) { const ( @@ -464,7 +556,6 @@ func TestMultiExpG2(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G2Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G2Jac @@ -480,18 +571,36 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G2Jac, len(cRange)) for i, c := range cRange { _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G2] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -538,6 +647,79 @@ func TestMultiExpG2(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG2(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. 
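The reference MSM above fans work out over one buffered channel per chunk and reduces the partial results in chunk order. The same pattern in miniature, with integer partial sums standing in for the extended-Jacobian chunk results; this is toy code under that analogy, not the library's API.

	package main

	import "fmt"

	func main() {
		data := []int{3, 1, 4, 1, 5, 9, 2, 6}
		const nbChunks = 4
		n := len(data) / nbChunks

		// one buffered channel per chunk, like chChunks[i]
		ch := make([]chan int, nbChunks)
		for i := range ch {
			ch[i] = make(chan int, 1)
		}

		for j := 0; j < nbChunks; j++ {
			go func(j int) {
				sum := 0
				for _, v := range data[j*n : (j+1)*n] {
					sum += v
				}
				ch[j] <- sum // send the partial result, as processChunk does
			}(j)
		}

		// reduce: receive in chunk order
		total := 0
		for j := 0; j < nbChunks; j++ {
			total += <-ch[j]
		}
		fmt.Println(total) // 31
	}
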
+ // test only "odd" and "even" (ie windows size divide word size vs not) + cRange := []uint64{5, 16} + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G2Jac + _innerMsmG2Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G2Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG2Reference always do ext jacobian with c == 16 +func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG2Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG2(b *testing.B) { const ( diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index 54fbbb031f..33e7c834c5 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -144,7 +144,6 @@ func TestMultiExpG1(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G1Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G1Jac @@ -160,18 +159,36 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G1Jac, len(cRange)) for i, c := range cRange { _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G1] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -218,6 +235,81 @@ func TestMultiExpG1(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG1(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + cRange := []uint64{3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + if testing.Short() { + // test only "odd" and "even" (ie windows size divide word size vs not) + cRange = []uint64{5, 16} + } + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G1Jac + _innerMsmG1Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G1Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG1Reference always do ext jacobian with c == 16 +func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g1JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // the 
last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG1Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG1(b *testing.B) { const ( @@ -464,7 +556,6 @@ func TestMultiExpG2(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G2Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G2Jac @@ -480,18 +571,36 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G2Jac, len(cRange)) for i, c := range cRange { _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G2] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -538,6 +647,79 @@ func TestMultiExpG2(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG2(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. 
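The final msmReduceChunk* step recombines the per-chunk results as T = sum_j 2^(c*j) * T_j (with c == 16 here). Over plain integers that is just the statement that base-2^16 digits rebuild the number; the following sketch checks it with a Horner-style recombination, mirroring the doubling between chunks.

	package main

	import (
		"fmt"
		"math/big"
	)

	func main() {
		const c = 16
		s := big.NewInt(0xCAFEBABE)

		// digits of s in base 2^c, least significant first
		var digits []int64
		t := new(big.Int).Set(s)
		for t.Sign() != 0 {
			digits = append(digits, t.Int64()&((1<<c)-1))
			t.Rsh(t, c)
		}

		// recombine from the most significant chunk down:
		// shift by c between chunks, then add the chunk result
		sum := new(big.Int)
		for j := len(digits) - 1; j >= 0; j-- {
			sum.Lsh(sum, c)
			sum.Add(sum, big.NewInt(digits[j]))
		}
		fmt.Println(sum.Cmp(s) == 0) // true
	}
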
+ // test only "odd" and "even" (ie windows size divide word size vs not) + cRange := []uint64{5, 16} + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G2Jac + _innerMsmG2Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G2Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG2Reference always do ext jacobian with c == 16 +func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG2Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG2(b *testing.B) { const ( diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index a6cf0fbe50..3307840f6a 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -144,7 +144,6 @@ func TestMultiExpG1(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G1Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G1Jac @@ -160,18 +159,36 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G1Jac, len(cRange)) for i, c := range cRange { _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G1] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -218,6 +235,81 @@ func TestMultiExpG1(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG1(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + cRange := []uint64{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + if testing.Short() { + // test only "odd" and "even" (ie windows size divide word size vs not) + cRange = []uint64{5, 16} + } + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G1Jac + _innerMsmG1Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G1Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG1Reference always do ext jacobian with c == 16 +func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g1JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // 
the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG1Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG1(b *testing.B) { const ( @@ -464,7 +556,6 @@ func TestMultiExpG2(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G2Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G2Jac @@ -480,18 +571,36 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G2Jac, len(cRange)) for i, c := range cRange { _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G2] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -538,6 +647,79 @@ func TestMultiExpG2(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG2(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. 
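The rewritten properties assert results[i].Z.IsZero() instead of comparing consecutive results: in Jacobian coordinates the point at infinity is exactly the class with Z = 0, so the check reads "is infinity" directly. A toy illustration of that convention with a hypothetical struct (not the library's G1Jac):

	package main

	import (
		"fmt"
		"math/big"
	)

	// toyJac is a stand-in for a Jacobian point (X:Y:Z), with the usual
	// convention that Z == 0 encodes the point at infinity.
	type toyJac struct {
		X, Y, Z big.Int
	}

	func (p *toyJac) setInfinity() {
		p.X.SetInt64(1)
		p.Y.SetInt64(1)
		p.Z.SetInt64(0) // Z = 0 <=> infinity
	}

	func (p *toyJac) isInfinity() bool {
		return p.Z.Sign() == 0
	}

	func main() {
		var p toyJac
		p.setInfinity()
		fmt.Println(p.isInfinity()) // true
	}
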
+ // test only "odd" and "even" (ie windows size divide word size vs not) + cRange := []uint64{5, 16} + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G2Jac + _innerMsmG2Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G2Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG2Reference always do ext jacobian with c == 16 +func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG2Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG2(b *testing.B) { const ( diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 8e5c73a1e6..dc7ef60c2c 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -144,7 +144,6 @@ func TestMultiExpG1(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G1Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G1Jac @@ -160,18 +159,36 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G1Jac, len(cRange)) for i, c := range cRange { _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G1] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -218,6 +235,81 @@ func TestMultiExpG1(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG1(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + cRange := []uint64{4, 5, 8, 12, 16} + if testing.Short() { + // test only "odd" and "even" (ie windows size divide word size vs not) + cRange = []uint64{5, 16} + } + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G1Jac + _innerMsmG1Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G1Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG1Reference always do ext jacobian with c == 16 +func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g1JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // the last chunk may be processed 
with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG1Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG1(b *testing.B) { const ( @@ -464,7 +556,6 @@ func TestMultiExpG2(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G2Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G2Jac @@ -480,18 +571,36 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G2Jac, len(cRange)) for i, c := range cRange { _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G2] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -538,6 +647,79 @@ func TestMultiExpG2(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG2(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. 
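The new "vector of 0s" property keeps the gopter shape used throughout these tests: a generator feeds prop.ForAll, and the callback returns false on the first violation. A reduced skeleton of that shape, with a placeholder property over integers standing in for the MSM check:

	package main

	import (
		"testing"

		"github.com/leanovate/gopter"
		"github.com/leanovate/gopter/gen"
		"github.com/leanovate/gopter/prop"
	)

	func TestZeroSum(t *testing.T) {
		parameters := gopter.DefaultTestParameters()
		parameters.MinSuccessfulTests = 10

		properties := gopter.NewProperties(parameters)

		// stand-in for "MSM with all-zero scalars is the identity":
		// a sum of n zeros is zero, whatever n is
		properties.Property("sum of 0s is 0", prop.ForAll(
			func(n int) bool {
				sum := 0
				for i := 0; i < n; i++ {
					sum += 0
				}
				return sum == 0
			},
			gen.IntRange(1, 1000),
		))

		properties.TestingRun(t, gopter.ConsoleReporter(false))
	}
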
+ // test only "odd" and "even" (ie windows size divide word size vs not) + cRange := []uint64{5, 16} + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G2Jac + _innerMsmG2Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G2Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG2Reference always do ext jacobian with c == 16 +func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG2Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG2(b *testing.B) { const ( diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index 3daeb64fca..308efca4c9 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -144,7 +144,6 @@ func TestMultiExpG1(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G1Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G1Jac @@ -160,18 +159,36 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G1Jac, len(cRange)) for i, c := range cRange { _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G1] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -218,6 +235,81 @@ func TestMultiExpG1(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG1(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + cRange := []uint64{3, 4, 5, 8, 11, 16} + if testing.Short() { + // test only "odd" and "even" (ie windows size divide word size vs not) + cRange = []uint64{5, 16} + } + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G1Jac + _innerMsmG1Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G1Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG1Reference always do ext jacobian with c == 16 +func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g1JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // the last chunk may be processed 
with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG1Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG1(b *testing.B) { const ( @@ -464,7 +556,6 @@ func TestMultiExpG2(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G2Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G2Jac @@ -480,18 +571,36 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G2Jac, len(cRange)) for i, c := range cRange { _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G2] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -538,6 +647,79 @@ func TestMultiExpG2(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG2(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. 
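Trimming cRange under testing.Short() is the usual way to keep `go test -short` cheap while still covering one window size that divides the 64-bit word and one that does not. A minimal standalone version of the pattern:

	package main

	import (
		"fmt"
		"testing"
	)

	func TestWindowSizes(t *testing.T) {
		cRange := []uint64{3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
		if testing.Short() {
			// keep one window that divides the word size (16) and one that doesn't (5)
			cRange = []uint64{5, 16}
		}
		for _, c := range cRange {
			fmt.Println("would run msm with c =", c)
		}
	}
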
+ // test only "odd" and "even" (ie windows size divide word size vs not) + cRange := []uint64{5, 16} + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G2Jac + _innerMsmG2Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G2Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG2Reference always do ext jacobian with c == 16 +func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG2Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG2(b *testing.B) { const ( diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index 8520cc7435..98560005bc 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -144,7 +144,6 @@ func TestMultiExpG1(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G1Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G1Jac @@ -160,18 +159,36 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G1Jac, len(cRange)) for i, c := range cRange { _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G1] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -218,6 +235,81 @@ func TestMultiExpG1(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG1(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + cRange := []uint64{2, 3, 4, 5, 8, 10, 16} + if testing.Short() { + // test only "odd" and "even" (ie windows size divide word size vs not) + cRange = []uint64{5, 16} + } + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G1Jac + _innerMsmG1Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G1Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG1Reference always do ext jacobian with c == 16 +func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g1JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // the last chunk may be 
processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG1Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG1(b *testing.B) { const ( @@ -464,7 +556,6 @@ func TestMultiExpG2(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G2Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G2Jac @@ -480,18 +571,36 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G2Jac, len(cRange)) for i, c := range cRange { _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G2] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -538,6 +647,79 @@ func TestMultiExpG2(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG2(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. 
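The tests seed math/rand from the wall clock before picking the indices forced to infinity. Worth noting: rand.Seed is deprecated since Go 1.20, and a local rand.New(rand.NewSource(seed)) with a fixed seed (42 below is arbitrary) makes a failing run replayable; a sketch of that alternative:

	package main

	import (
		"fmt"
		"math/rand"
	)

	func main() {
		const nbSamples = 1 << 14

		// fixed seed: a failure can be replayed with the same indices
		rng := rand.New(rand.NewSource(42))

		infinityIdx := make([]int, 4)
		for i := range infinityIdx {
			infinityIdx[i] = rng.Intn(nbSamples) // index of a point forced to infinity
		}
		fmt.Println(infinityIdx)
	}
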
+	// test only "odd" and "even" (i.e. window sizes that divide the word size vs. those that don't)
+	cRange := []uint64{5, 16}
+
+	results := make([]G2Jac, len(cRange))
+	for i, c := range cRange {
+		_innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()})
+	}
+
+	var r G2Jac
+	_innerMsmG2Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()})
+
+	var expected, got G2Affine
+	expected.FromJacobian(&r)
+
+	for i := 0; i < len(results); i++ {
+		got.FromJacobian(&results[i])
+		if !expected.Equal(&got) {
+			t.Fatalf("cross msm failed with c=%d", cRange[i])
+		}
+	}
+
+}
+
+// _innerMsmG2Reference always does ext. Jacobian with c == 16
+func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac {
+	// partition the scalars
+	digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks)
+
+	nbChunks := computeNbChunks(16)
+
+	// for each chunk, spawn one goroutine that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack and this is critical for performance
+
+	// each goroutine sends its result in chChunks[i] channel
+	chChunks := make([]chan g2JacExtended, nbChunks)
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan g2JacExtended, 1)
+	}
+
+	// the last chunk may be processed with a different method than the rest, as it could be smaller.
+	n := len(points)
+	for j := int(nbChunks - 1); j >= 0; j-- {
+		processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16]
+		go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n])
+	}
+
+	return msmReduceChunkG2Affine(p, int(16), chChunks[:])
+}
+
 func BenchmarkMultiExpG2(b *testing.B) {

 	const (
diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl
index 08c7acd6ff..d99a83a49f 100644
--- a/internal/generator/ecc/template/tests/multiexp.go.tmpl
+++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl
@@ -29,7 +29,7 @@ import (

 {{define "multiexp" }}

-func TestMultiExp{{toUpper $.PointName}}(t *testing.T) {
+func TestMultiExp{{$.UPointName}}(t *testing.T) {

 	parameters := gopter.DefaultTestParameters()
 	if testing.Short() {
@@ -72,7 +72,7 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) {

 	// ensure a multiexp that's splitted has the same result as a non-splitted one..
diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl
index 08c7acd6ff..d99a83a49f 100644
--- a/internal/generator/ecc/template/tests/multiexp.go.tmpl
+++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl
@@ -29,7 +29,7 @@ import (

 {{define "multiexp" }}
-func TestMultiExp{{toUpper $.PointName}}(t *testing.T) {
+func TestMultiExp{{$.UPointName}}(t *testing.T) {

 parameters := gopter.DefaultTestParameters()
 if testing.Short() {
@@ -72,7 +72,7 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) {

 // ensure a multiexp that's splitted has the same result as a non-splitted one..
- properties.Property("[{{ toUpper $.PointName }}] Multi exponentation (c=16) should be consistent with splitted multiexp", prop.ForAll(
+ properties.Property("[{{ $.UPointName }}] Multi exponentiation (c=16) should be consistent with split multiexp", prop.ForAll(
 func(mixer fr.Element) bool {
 var samplePointsLarge [nbSamples*13]{{ $.TAffine }}
 for i:=0; i<13; i++ {
@@ -113,7 +113,7 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) {
 cRange := []uint64{5, 16}
 {{- end}}

- properties.Property(fmt.Sprintf("[{{ toUpper $.PointName }}] Multi exponentation (c in %v) should be consistent with sum of square", cRange), prop.ForAll(
+ properties.Property(fmt.Sprintf("[{{ $.UPointName }}] Multi exponentiation (c in %v) should be consistent with sum of square", cRange), prop.ForAll(
 func(mixer fr.Element) bool {

 var expected {{ $.TJacobian }}

@@ -148,11 +148,10 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) {
 genScalar,
 ))

- properties.Property(fmt.Sprintf("[{{ toUpper $.PointName }}] Multi exponentation (c in %v) of points at infinity should output a point at infinity", cRange), prop.ForAll(
+ properties.Property(fmt.Sprintf("[{{ $.UPointName }}] Multi exponentiation (c in %v) of points at infinity should output a point at infinity", cRange), prop.ForAll(
 func(mixer fr.Element) bool {

 var samplePointsZero [nbSamples]{{ $.TAffine }}
- copy(samplePointsZero[:], samplePoints[:])

 var expected {{ $.TJacobian }}

@@ -168,18 +167,37 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) {
 sampleScalars[i-1].SetUint64(uint64(i)).
 Mul(&sampleScalars[i-1], &mixer).
 FromMont()
- if i % 10 == 0 {
- samplePointsZero[i].setInfinity()
- }
+ samplePointsZero[i-1].setInfinity()
 }

 results := make([]{{ $.TJacobian }}, len(cRange))
 for i, c := range cRange {
 _innerMsm{{ $.UPointName }}(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()})
 }
- for i := 1; i < len(results); i++ {
- if !results[i].Equal(&results[i-1]) {
- t.Logf("result for c=%d != c=%d", cRange[i-1],cRange[i])
+ for i := 0; i < len(results); i++ {
+ if !results[i].Z.IsZero() {
+ t.Logf("result for c=%d is not infinity", cRange[i])
+ return false
+ }
+ }
+ return true
+ },
+ genScalar,
+ ))
+
+ properties.Property(fmt.Sprintf("[{{ $.UPointName }}] Multi exponentiation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll(
+ func(mixer fr.Element) bool {
+ // mixer ensures that all the words of an fpElement are set
+ var sampleScalars [nbSamples]fr.Element
+
+
+ results := make([]{{ $.TJacobian }}, len(cRange))
+ for i, c := range cRange {
+ _innerMsm{{ $.UPointName }}(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()})
+ }
+ for i := 0; i < len(results); i++ {
+ if !results[i].Z.IsZero() {
+ t.Logf("result for c=%d is not infinity", cRange[i])
 return false
 }
 }
@@ -191,7 +209,7 @@

 // note : this test is here as we expect to have a different multiExp than the above bucket method
 // for small number of points
- properties.Property("[{{ toUpper $.PointName }}] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll(
+ properties.Property("[{{ $.UPointName }}] Multi exponentiation (<50 points) should be consistent with sum of square", prop.ForAll(
 func(mixer fr.Element) bool {

 var g {{ $.TJacobian }}

@@ -228,10 +246,94 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) {
 properties.TestingRun(t,
 gopter.ConsoleReporter(false))
 }
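The cRange values picked by the template drive the size of the bucket arrays each chunk processor allocates: with the signed-digit recoding produced by partitionScalars, a window of width c needs 2^(c-1) buckets. A small illustration of that arithmetic (nbBucketsSketch is a hypothetical helper; the generator introduces an equivalent nbBuckets template function later in this series):

    // nbBucketsSketch returns the bucket count for a window of width c bits.
    // Signed digits take values in roughly [-2^(c-1), 2^(c-1)]; negative
    // digits reuse the same buckets via subtraction, and digit 0 needs no
    // bucket, so 2^(c-1) buckets suffice per window.
    func nbBucketsSketch(c uint64) int {
        return 1 << (c - 1)
    }

For example, nbBucketsSketch(5) == 16 and nbBucketsSketch(16) == 32768, matching the bucketg1JacExtendedC5 and bucketg1JacExtendedC16 fixed-size array types declared in the generated files.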
+
+func TestCrossMultiExp{{ $.UPointName }}(t *testing.T) {
+ const nbSamples = 1 << 14
+ // multi exp points
+ var samplePoints [nbSamples]{{ $.TAffine }}
+ var g {{ $.TJacobian }}
+ g.Set(&{{ toLower $.PointName }}Gen)
+ for i := 1; i <= nbSamples; i++ {
+ samplePoints[i-1].FromJacobian(&g)
+ g.AddAssign(&{{ toLower $.PointName }}Gen)
+ }
+
+ // sprinkle some points at infinity
+ rand.Seed(time.Now().UnixNano())
+ samplePoints[rand.Intn(nbSamples)].setInfinity()
+ samplePoints[rand.Intn(nbSamples)].setInfinity()
+ samplePoints[rand.Intn(nbSamples)].setInfinity()
+ samplePoints[rand.Intn(nbSamples)].setInfinity()
+
+ var sampleScalars [nbSamples]fr.Element
+ fillBenchScalars(sampleScalars[:])
+
+ // cRange is generated from template and contains the available parameters for the multiexp window size
+ {{- if eq $.PointName "g1" }}
+ cRange := []uint64{
+ {{- range $c := $.CRange}}{{- if gt $c 1}}{{$c}},{{- end}}{{- end}}
+ }
+ if testing.Short() {
+ // test only "odd" and "even" (i.e. window sizes that divide the word size vs. those that do not)
+ cRange = []uint64{5, 16}
+ }
+ {{- else }}
+ // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets.
+ // test only "odd" and "even" (i.e. window sizes that divide the word size vs. those that do not)
+ cRange := []uint64{5, 16}
+ {{- end}}
+
+ results := make([]{{ $.TJacobian }}, len(cRange))
+ for i, c := range cRange {
+ _innerMsm{{ $.UPointName }}(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()})
+ }
+
+ var r {{ $.TJacobian }}
+ _innerMsm{{ $.UPointName }}Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()})
+
+ var expected, got {{ $.TAffine}}
+ expected.FromJacobian(&r)
+
+ for i:=0; i<len(results); i++ {
+ got.FromJacobian(&results[i])
+ if !expected.Equal(&got) {
+ t.Fatalf("cross msm failed with c=%d", cRange[i])
+ }
+ }
+
+}
+
+// _innerMsm{{ $.UPointName }}Reference always uses ext jacobian with c == 16
+func _innerMsm{{ $.UPointName }}Reference(p *{{ $.TJacobian }}, points []{{ $.TAffine }}, scalars []fr.Element, config ecc.MultiExpConfig) *{{ $.TJacobian }} {
+ // partition the scalars
+ digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks)
+
+ nbChunks := computeNbChunks(16)
+
+ // for each chunk, spawn one go routine that'll loop through all the scalars in the
+ // corresponding bit-window
+ // note that buckets is an array allocated on the stack and this is critical for performance
+
+ // each go routine sends its result in chChunks[i] channel
+ chChunks := make([]chan {{ $.TJacobianExtended }}, nbChunks)
+ for i := 0; i < len(chChunks); i++ {
+ chChunks[i] = make(chan {{ $.TJacobianExtended }}, 1)
+ }
+
+ // the last chunk may be processed with a different method than the rest, as it could be smaller.
+ n := len(points)
+ for j := int(nbChunks - 1); j >= 0; j-- {
+ processChunk := processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C16]
+ go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n])
+ }
+
+ return msmReduceChunk{{ $.TAffine }}(p, int(16), chChunks[:])
+}

-func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) {
+func BenchmarkMultiExp{{ $.UPointName }}(b *testing.B) {

 const (
 pow = (bits.UintSize / 2) - (bits.UintSize / 8) // 24 on 64 bits arch, 12 on 32 bits
@@ -266,7 +368,7 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) {
 }
 }

- fillBenchBases{{ toUpper $.PointName }}(samplePoints[:])
+ fillBenchBases{{ $.UPointName }}(samplePoints[:])

 var testPoint {{ $.TAffine }}

@@ -298,7 +400,7 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) {
 }

-func BenchmarkMultiExp{{ toUpper $.PointName }}Reference(b *testing.B) {
+func BenchmarkMultiExp{{ $.UPointName }}Reference(b *testing.B) {
 const nbSamples = 1 << 20

 var (
@@ -307,7 +409,7 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}Reference(b *testing.B) {
 )

 fillBenchScalars(sampleScalars[:])
- fillBenchBases{{ toUpper $.PointName }}(samplePoints[:])
+ fillBenchBases{{ $.UPointName }}(samplePoints[:])

 var testPoint {{ $.TAffine }}

@@ -318,7 +420,7 @@
 }

-func BenchmarkManyMultiExp{{ toUpper $.PointName }}Reference(b *testing.B) {
+func BenchmarkManyMultiExp{{ $.UPointName }}Reference(b *testing.B) {
 const nbSamples = 1 << 20

 var (
@@ -327,7 +429,7 @@ func BenchmarkManyMultiExp{{ toUpper $.PointName }}Reference(b *testing.B) {
 )

 fillBenchScalars(sampleScalars[:])
- fillBenchBases{{ toUpper $.PointName }}(samplePoints[:])
+ fillBenchBases{{ $.UPointName }}(samplePoints[:])

 var t1, t2, t3 {{ $.TAffine
}} @@ -357,7 +459,7 @@ func BenchmarkManyMultiExp{{ toUpper $.PointName }}Reference(b *testing.B) { // Rationale for generating points that are not on the curve is that for large benchmarks, generating // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. -func fillBenchBases{{ toUpper $.PointName }}(samplePoints []{{ $.TAffine }}) { +func fillBenchBases{{ $.UPointName }}(samplePoints []{{ $.TAffine }}) { var r big.Int r.SetString("340444420969191673093399857471996460938405", 10) samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) From 37ae24ef7e3861ef04c33d31361ac9dfecf2daad Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 16 Nov 2022 15:39:25 -0600 Subject: [PATCH 37/43] style: make staticcheck happier by code generating bucket sizes in clear --- ecc/bls12-377/multiexp_affine.go | 58 +++++++++--------- ecc/bls12-377/multiexp_jacobian.go | 60 +++++++++---------- ecc/bls12-378/multiexp_affine.go | 58 +++++++++--------- ecc/bls12-378/multiexp_jacobian.go | 60 +++++++++---------- ecc/bls12-381/multiexp_affine.go | 58 +++++++++--------- ecc/bls12-381/multiexp_jacobian.go | 60 +++++++++---------- ecc/bls24-315/multiexp_affine.go | 58 +++++++++--------- ecc/bls24-315/multiexp_jacobian.go | 60 +++++++++---------- ecc/bls24-317/multiexp_affine.go | 58 +++++++++--------- ecc/bls24-317/multiexp_jacobian.go | 60 +++++++++---------- ecc/bn254/multiexp_affine.go | 58 +++++++++--------- ecc/bn254/multiexp_jacobian.go | 60 +++++++++---------- ecc/bw6-633/multiexp_affine.go | 20 +++---- ecc/bw6-633/multiexp_jacobian.go | 24 ++++---- ecc/bw6-756/multiexp_affine.go | 20 +++---- ecc/bw6-756/multiexp_jacobian.go | 24 ++++---- ecc/bw6-761/multiexp_affine.go | 22 +++---- ecc/bw6-761/multiexp_jacobian.go | 28 ++++----- internal/generator/ecc/generate.go | 4 ++ .../ecc/template/multiexp_affine.go.tmpl | 4 +- .../ecc/template/multiexp_jacobian.go.tmpl | 2 +- 21 files changed, 430 insertions(+), 426 deletions(-) diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index 83ce91c32a..331f283ede 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -235,13 +235,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC10 [1 << (10 - 1)]G1Affine -type bucketG1AffineC11 [1 << (11 - 1)]G1Affine -type bucketG1AffineC12 [1 << (12 - 1)]G1Affine -type bucketG1AffineC13 [1 << (13 - 1)]G1Affine -type bucketG1AffineC14 [1 << (14 - 1)]G1Affine -type bucketG1AffineC15 [1 << (15 - 1)]G1Affine -type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC10 [512]G1Affine +type bucketG1AffineC11 [1024]G1Affine +type bucketG1AffineC12 [2048]G1Affine +type bucketG1AffineC13 [4096]G1Affine +type bucketG1AffineC14 [8192]G1Affine +type bucketG1AffineC15 [16384]G1Affine +type bucketG1AffineC16 [32768]G1Affine // buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { @@ -554,13 +554,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC10 [1 << (10 - 1)]G2Affine -type bucketG2AffineC11 [1 << (11 - 1)]G2Affine -type bucketG2AffineC12 [1 << (12 - 1)]G2Affine -type 
bucketG2AffineC13 [1 << (13 - 1)]G2Affine -type bucketG2AffineC14 [1 << (14 - 1)]G2Affine -type bucketG2AffineC15 [1 << (15 - 1)]G2Affine -type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC10 [512]G2Affine +type bucketG2AffineC11 [1024]G2Affine +type bucketG2AffineC12 [2048]G2Affine +type bucketG2AffineC13 [4096]G2Affine +type bucketG2AffineC14 [8192]G2Affine +type bucketG2AffineC15 [16384]G2Affine +type bucketG2AffineC16 [32768]G2Affine // buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { @@ -659,21 +659,21 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine -type bitSetC1 [1 << (1 - 1)]bool -type bitSetC2 [1 << (2 - 1)]bool -type bitSetC4 [1 << (4 - 1)]bool -type bitSetC5 [1 << (5 - 1)]bool -type bitSetC6 [1 << (6 - 1)]bool -type bitSetC7 [1 << (7 - 1)]bool -type bitSetC8 [1 << (8 - 1)]bool -type bitSetC9 [1 << (9 - 1)]bool -type bitSetC10 [1 << (10 - 1)]bool -type bitSetC11 [1 << (11 - 1)]bool -type bitSetC12 [1 << (12 - 1)]bool -type bitSetC13 [1 << (13 - 1)]bool -type bitSetC14 [1 << (14 - 1)]bool -type bitSetC15 [1 << (15 - 1)]bool -type bitSetC16 [1 << (16 - 1)]bool +type bitSetC1 [1]bool +type bitSetC2 [2]bool +type bitSetC4 [8]bool +type bitSetC5 [16]bool +type bitSetC6 [32]bool +type bitSetC7 [64]bool +type bitSetC8 [128]bool +type bitSetC9 [256]bool +type bitSetC10 [512]bool +type bitSetC11 [1024]bool +type bitSetC12 [2048]bool +type bitSetC13 [4096]bool +type bitSetC14 [8192]bool +type bitSetC15 [16384]bool +type bitSetC16 [32768]bool type bitSet interface { bitSetC1 | diff --git a/ecc/bls12-377/multiexp_jacobian.go b/ecc/bls12-377/multiexp_jacobian.go index 8fd4e382ff..f34d5ff332 100644 --- a/ecc/bls12-377/multiexp_jacobian.go +++ b/ecc/bls12-377/multiexp_jacobian.go @@ -61,21 +61,21 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended -type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended -type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended -type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended -type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended -type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended -type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended -type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended -type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended -type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended -type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended -type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC1 [1]g1JacExtended +type bucketg1JacExtendedC2 [2]g1JacExtended +type bucketg1JacExtendedC4 [8]g1JacExtended +type bucketg1JacExtendedC5 [16]g1JacExtended +type bucketg1JacExtendedC6 [32]g1JacExtended +type bucketg1JacExtendedC7 [64]g1JacExtended +type bucketg1JacExtendedC8 [128]g1JacExtended +type bucketg1JacExtendedC9 [256]g1JacExtended +type bucketg1JacExtendedC10 [512]g1JacExtended +type bucketg1JacExtendedC11 [1024]g1JacExtended +type bucketg1JacExtendedC12 [2048]g1JacExtended +type bucketg1JacExtendedC13 [4096]g1JacExtended +type bucketg1JacExtendedC14 [8192]g1JacExtended +type bucketg1JacExtendedC15 [16384]g1JacExtended +type bucketg1JacExtendedC16 
[32768]g1JacExtended type ibg1JacExtended interface { bucketg1JacExtendedC1 | @@ -140,21 +140,21 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended -type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended -type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended -type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended -type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended -type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended -type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended -type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended -type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended -type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended -type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended -type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC1 [1]g2JacExtended +type bucketg2JacExtendedC2 [2]g2JacExtended +type bucketg2JacExtendedC4 [8]g2JacExtended +type bucketg2JacExtendedC5 [16]g2JacExtended +type bucketg2JacExtendedC6 [32]g2JacExtended +type bucketg2JacExtendedC7 [64]g2JacExtended +type bucketg2JacExtendedC8 [128]g2JacExtended +type bucketg2JacExtendedC9 [256]g2JacExtended +type bucketg2JacExtendedC10 [512]g2JacExtended +type bucketg2JacExtendedC11 [1024]g2JacExtended +type bucketg2JacExtendedC12 [2048]g2JacExtended +type bucketg2JacExtendedC13 [4096]g2JacExtended +type bucketg2JacExtendedC14 [8192]g2JacExtended +type bucketg2JacExtendedC15 [16384]g2JacExtended +type bucketg2JacExtendedC16 [32768]g2JacExtended type ibg2JacExtended interface { bucketg2JacExtendedC1 | diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index d8b54b76ca..0f65e1838d 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -235,13 +235,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC10 [1 << (10 - 1)]G1Affine -type bucketG1AffineC11 [1 << (11 - 1)]G1Affine -type bucketG1AffineC12 [1 << (12 - 1)]G1Affine -type bucketG1AffineC13 [1 << (13 - 1)]G1Affine -type bucketG1AffineC14 [1 << (14 - 1)]G1Affine -type bucketG1AffineC15 [1 << (15 - 1)]G1Affine -type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC10 [512]G1Affine +type bucketG1AffineC11 [1024]G1Affine +type bucketG1AffineC12 [2048]G1Affine +type bucketG1AffineC13 [4096]G1Affine +type bucketG1AffineC14 [8192]G1Affine +type bucketG1AffineC15 [16384]G1Affine +type bucketG1AffineC16 [32768]G1Affine // buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { @@ -554,13 +554,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC10 [1 << (10 - 1)]G2Affine -type bucketG2AffineC11 [1 << (11 - 1)]G2Affine -type bucketG2AffineC12 [1 << (12 - 1)]G2Affine -type bucketG2AffineC13 [1 << (13 - 1)]G2Affine -type bucketG2AffineC14 [1 << (14 - 1)]G2Affine -type bucketG2AffineC15 [1 << (15 - 1)]G2Affine -type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC10 
[512]G2Affine +type bucketG2AffineC11 [1024]G2Affine +type bucketG2AffineC12 [2048]G2Affine +type bucketG2AffineC13 [4096]G2Affine +type bucketG2AffineC14 [8192]G2Affine +type bucketG2AffineC15 [16384]G2Affine +type bucketG2AffineC16 [32768]G2Affine // buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { @@ -659,21 +659,21 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine -type bitSetC2 [1 << (2 - 1)]bool -type bitSetC3 [1 << (3 - 1)]bool -type bitSetC4 [1 << (4 - 1)]bool -type bitSetC5 [1 << (5 - 1)]bool -type bitSetC6 [1 << (6 - 1)]bool -type bitSetC7 [1 << (7 - 1)]bool -type bitSetC8 [1 << (8 - 1)]bool -type bitSetC9 [1 << (9 - 1)]bool -type bitSetC10 [1 << (10 - 1)]bool -type bitSetC11 [1 << (11 - 1)]bool -type bitSetC12 [1 << (12 - 1)]bool -type bitSetC13 [1 << (13 - 1)]bool -type bitSetC14 [1 << (14 - 1)]bool -type bitSetC15 [1 << (15 - 1)]bool -type bitSetC16 [1 << (16 - 1)]bool +type bitSetC2 [2]bool +type bitSetC3 [4]bool +type bitSetC4 [8]bool +type bitSetC5 [16]bool +type bitSetC6 [32]bool +type bitSetC7 [64]bool +type bitSetC8 [128]bool +type bitSetC9 [256]bool +type bitSetC10 [512]bool +type bitSetC11 [1024]bool +type bitSetC12 [2048]bool +type bitSetC13 [4096]bool +type bitSetC14 [8192]bool +type bitSetC15 [16384]bool +type bitSetC16 [32768]bool type bitSet interface { bitSetC2 | diff --git a/ecc/bls12-378/multiexp_jacobian.go b/ecc/bls12-378/multiexp_jacobian.go index eb83e3c1c2..0e9d572e73 100644 --- a/ecc/bls12-378/multiexp_jacobian.go +++ b/ecc/bls12-378/multiexp_jacobian.go @@ -61,21 +61,21 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended -type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended -type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended -type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended -type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended -type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended -type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended -type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended -type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended -type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended -type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended -type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [2]g1JacExtended +type bucketg1JacExtendedC3 [4]g1JacExtended +type bucketg1JacExtendedC4 [8]g1JacExtended +type bucketg1JacExtendedC5 [16]g1JacExtended +type bucketg1JacExtendedC6 [32]g1JacExtended +type bucketg1JacExtendedC7 [64]g1JacExtended +type bucketg1JacExtendedC8 [128]g1JacExtended +type bucketg1JacExtendedC9 [256]g1JacExtended +type bucketg1JacExtendedC10 [512]g1JacExtended +type bucketg1JacExtendedC11 [1024]g1JacExtended +type bucketg1JacExtendedC12 [2048]g1JacExtended +type bucketg1JacExtendedC13 [4096]g1JacExtended +type bucketg1JacExtendedC14 [8192]g1JacExtended +type bucketg1JacExtendedC15 [16384]g1JacExtended +type bucketg1JacExtendedC16 [32768]g1JacExtended type ibg1JacExtended interface { bucketg1JacExtendedC2 | @@ -140,21 +140,21 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // 
this allow us to allocate the buckets on the stack -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended -type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended -type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended -type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended -type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended -type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended -type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended -type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended -type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended -type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended -type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended -type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [2]g2JacExtended +type bucketg2JacExtendedC3 [4]g2JacExtended +type bucketg2JacExtendedC4 [8]g2JacExtended +type bucketg2JacExtendedC5 [16]g2JacExtended +type bucketg2JacExtendedC6 [32]g2JacExtended +type bucketg2JacExtendedC7 [64]g2JacExtended +type bucketg2JacExtendedC8 [128]g2JacExtended +type bucketg2JacExtendedC9 [256]g2JacExtended +type bucketg2JacExtendedC10 [512]g2JacExtended +type bucketg2JacExtendedC11 [1024]g2JacExtended +type bucketg2JacExtendedC12 [2048]g2JacExtended +type bucketg2JacExtendedC13 [4096]g2JacExtended +type bucketg2JacExtendedC14 [8192]g2JacExtended +type bucketg2JacExtendedC15 [16384]g2JacExtended +type bucketg2JacExtendedC16 [32768]g2JacExtended type ibg2JacExtended interface { bucketg2JacExtendedC2 | diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index bfc282b553..c566026823 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -235,13 +235,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC10 [1 << (10 - 1)]G1Affine -type bucketG1AffineC11 [1 << (11 - 1)]G1Affine -type bucketG1AffineC12 [1 << (12 - 1)]G1Affine -type bucketG1AffineC13 [1 << (13 - 1)]G1Affine -type bucketG1AffineC14 [1 << (14 - 1)]G1Affine -type bucketG1AffineC15 [1 << (15 - 1)]G1Affine -type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC10 [512]G1Affine +type bucketG1AffineC11 [1024]G1Affine +type bucketG1AffineC12 [2048]G1Affine +type bucketG1AffineC13 [4096]G1Affine +type bucketG1AffineC14 [8192]G1Affine +type bucketG1AffineC15 [16384]G1Affine +type bucketG1AffineC16 [32768]G1Affine // buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { @@ -554,13 +554,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC10 [1 << (10 - 1)]G2Affine -type bucketG2AffineC11 [1 << (11 - 1)]G2Affine -type bucketG2AffineC12 [1 << (12 - 1)]G2Affine -type bucketG2AffineC13 [1 << (13 - 1)]G2Affine -type bucketG2AffineC14 [1 << (14 - 1)]G2Affine -type bucketG2AffineC15 [1 << (15 - 1)]G2Affine -type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC10 [512]G2Affine +type bucketG2AffineC11 [1024]G2Affine +type bucketG2AffineC12 [2048]G2Affine +type bucketG2AffineC13 [4096]G2Affine +type bucketG2AffineC14 [8192]G2Affine +type bucketG2AffineC15 [16384]G2Affine +type 
bucketG2AffineC16 [32768]G2Affine // buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { @@ -659,21 +659,21 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine -type bitSetC1 [1 << (1 - 1)]bool -type bitSetC3 [1 << (3 - 1)]bool -type bitSetC4 [1 << (4 - 1)]bool -type bitSetC5 [1 << (5 - 1)]bool -type bitSetC6 [1 << (6 - 1)]bool -type bitSetC7 [1 << (7 - 1)]bool -type bitSetC8 [1 << (8 - 1)]bool -type bitSetC9 [1 << (9 - 1)]bool -type bitSetC10 [1 << (10 - 1)]bool -type bitSetC11 [1 << (11 - 1)]bool -type bitSetC12 [1 << (12 - 1)]bool -type bitSetC13 [1 << (13 - 1)]bool -type bitSetC14 [1 << (14 - 1)]bool -type bitSetC15 [1 << (15 - 1)]bool -type bitSetC16 [1 << (16 - 1)]bool +type bitSetC1 [1]bool +type bitSetC3 [4]bool +type bitSetC4 [8]bool +type bitSetC5 [16]bool +type bitSetC6 [32]bool +type bitSetC7 [64]bool +type bitSetC8 [128]bool +type bitSetC9 [256]bool +type bitSetC10 [512]bool +type bitSetC11 [1024]bool +type bitSetC12 [2048]bool +type bitSetC13 [4096]bool +type bitSetC14 [8192]bool +type bitSetC15 [16384]bool +type bitSetC16 [32768]bool type bitSet interface { bitSetC1 | diff --git a/ecc/bls12-381/multiexp_jacobian.go b/ecc/bls12-381/multiexp_jacobian.go index bc304041f6..55cdd766b5 100644 --- a/ecc/bls12-381/multiexp_jacobian.go +++ b/ecc/bls12-381/multiexp_jacobian.go @@ -61,21 +61,21 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended -type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended -type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended -type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended -type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended -type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended -type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended -type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended -type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended -type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended -type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended -type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC1 [1]g1JacExtended +type bucketg1JacExtendedC3 [4]g1JacExtended +type bucketg1JacExtendedC4 [8]g1JacExtended +type bucketg1JacExtendedC5 [16]g1JacExtended +type bucketg1JacExtendedC6 [32]g1JacExtended +type bucketg1JacExtendedC7 [64]g1JacExtended +type bucketg1JacExtendedC8 [128]g1JacExtended +type bucketg1JacExtendedC9 [256]g1JacExtended +type bucketg1JacExtendedC10 [512]g1JacExtended +type bucketg1JacExtendedC11 [1024]g1JacExtended +type bucketg1JacExtendedC12 [2048]g1JacExtended +type bucketg1JacExtendedC13 [4096]g1JacExtended +type bucketg1JacExtendedC14 [8192]g1JacExtended +type bucketg1JacExtendedC15 [16384]g1JacExtended +type bucketg1JacExtendedC16 [32768]g1JacExtended type ibg1JacExtended interface { bucketg1JacExtendedC1 | @@ -140,21 +140,21 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended -type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended 
-type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended -type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended -type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended -type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended -type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended -type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended -type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended -type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended -type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended -type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC1 [1]g2JacExtended +type bucketg2JacExtendedC3 [4]g2JacExtended +type bucketg2JacExtendedC4 [8]g2JacExtended +type bucketg2JacExtendedC5 [16]g2JacExtended +type bucketg2JacExtendedC6 [32]g2JacExtended +type bucketg2JacExtendedC7 [64]g2JacExtended +type bucketg2JacExtendedC8 [128]g2JacExtended +type bucketg2JacExtendedC9 [256]g2JacExtended +type bucketg2JacExtendedC10 [512]g2JacExtended +type bucketg2JacExtendedC11 [1024]g2JacExtended +type bucketg2JacExtendedC12 [2048]g2JacExtended +type bucketg2JacExtendedC13 [4096]g2JacExtended +type bucketg2JacExtendedC14 [8192]g2JacExtended +type bucketg2JacExtendedC15 [16384]g2JacExtended +type bucketg2JacExtendedC16 [32768]g2JacExtended type ibg2JacExtended interface { bucketg2JacExtendedC1 | diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index 4e679fea95..4bf2f7f50c 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -235,13 +235,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC10 [1 << (10 - 1)]G1Affine -type bucketG1AffineC11 [1 << (11 - 1)]G1Affine -type bucketG1AffineC12 [1 << (12 - 1)]G1Affine -type bucketG1AffineC13 [1 << (13 - 1)]G1Affine -type bucketG1AffineC14 [1 << (14 - 1)]G1Affine -type bucketG1AffineC15 [1 << (15 - 1)]G1Affine -type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC10 [512]G1Affine +type bucketG1AffineC11 [1024]G1Affine +type bucketG1AffineC12 [2048]G1Affine +type bucketG1AffineC13 [4096]G1Affine +type bucketG1AffineC14 [8192]G1Affine +type bucketG1AffineC15 [16384]G1Affine +type bucketG1AffineC16 [32768]G1Affine // buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { @@ -554,13 +554,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC10 [1 << (10 - 1)]G2Affine -type bucketG2AffineC11 [1 << (11 - 1)]G2Affine -type bucketG2AffineC12 [1 << (12 - 1)]G2Affine -type bucketG2AffineC13 [1 << (13 - 1)]G2Affine -type bucketG2AffineC14 [1 << (14 - 1)]G2Affine -type bucketG2AffineC15 [1 << (15 - 1)]G2Affine -type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC10 [512]G2Affine +type bucketG2AffineC11 [1024]G2Affine +type bucketG2AffineC12 [2048]G2Affine +type bucketG2AffineC13 [4096]G2Affine +type bucketG2AffineC14 [8192]G2Affine +type bucketG2AffineC15 [16384]G2Affine +type bucketG2AffineC16 [32768]G2Affine // buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { @@ -659,21 +659,21 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type 
qG2AffineC16 [640]batchOpG2Affine -type bitSetC1 [1 << (1 - 1)]bool -type bitSetC2 [1 << (2 - 1)]bool -type bitSetC4 [1 << (4 - 1)]bool -type bitSetC5 [1 << (5 - 1)]bool -type bitSetC6 [1 << (6 - 1)]bool -type bitSetC7 [1 << (7 - 1)]bool -type bitSetC8 [1 << (8 - 1)]bool -type bitSetC9 [1 << (9 - 1)]bool -type bitSetC10 [1 << (10 - 1)]bool -type bitSetC11 [1 << (11 - 1)]bool -type bitSetC12 [1 << (12 - 1)]bool -type bitSetC13 [1 << (13 - 1)]bool -type bitSetC14 [1 << (14 - 1)]bool -type bitSetC15 [1 << (15 - 1)]bool -type bitSetC16 [1 << (16 - 1)]bool +type bitSetC1 [1]bool +type bitSetC2 [2]bool +type bitSetC4 [8]bool +type bitSetC5 [16]bool +type bitSetC6 [32]bool +type bitSetC7 [64]bool +type bitSetC8 [128]bool +type bitSetC9 [256]bool +type bitSetC10 [512]bool +type bitSetC11 [1024]bool +type bitSetC12 [2048]bool +type bitSetC13 [4096]bool +type bitSetC14 [8192]bool +type bitSetC15 [16384]bool +type bitSetC16 [32768]bool type bitSet interface { bitSetC1 | diff --git a/ecc/bls24-315/multiexp_jacobian.go b/ecc/bls24-315/multiexp_jacobian.go index 23310862df..0cd3432dad 100644 --- a/ecc/bls24-315/multiexp_jacobian.go +++ b/ecc/bls24-315/multiexp_jacobian.go @@ -61,21 +61,21 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended -type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended -type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended -type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended -type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended -type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended -type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended -type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended -type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended -type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended -type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended -type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC1 [1]g1JacExtended +type bucketg1JacExtendedC2 [2]g1JacExtended +type bucketg1JacExtendedC4 [8]g1JacExtended +type bucketg1JacExtendedC5 [16]g1JacExtended +type bucketg1JacExtendedC6 [32]g1JacExtended +type bucketg1JacExtendedC7 [64]g1JacExtended +type bucketg1JacExtendedC8 [128]g1JacExtended +type bucketg1JacExtendedC9 [256]g1JacExtended +type bucketg1JacExtendedC10 [512]g1JacExtended +type bucketg1JacExtendedC11 [1024]g1JacExtended +type bucketg1JacExtendedC12 [2048]g1JacExtended +type bucketg1JacExtendedC13 [4096]g1JacExtended +type bucketg1JacExtendedC14 [8192]g1JacExtended +type bucketg1JacExtendedC15 [16384]g1JacExtended +type bucketg1JacExtendedC16 [32768]g1JacExtended type ibg1JacExtended interface { bucketg1JacExtendedC1 | @@ -140,21 +140,21 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended -type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended -type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended -type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended -type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended -type bucketg2JacExtendedC8 [1 << (8 - 
1)]g2JacExtended -type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended -type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended -type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended -type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended -type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended -type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended -type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC1 [1]g2JacExtended +type bucketg2JacExtendedC2 [2]g2JacExtended +type bucketg2JacExtendedC4 [8]g2JacExtended +type bucketg2JacExtendedC5 [16]g2JacExtended +type bucketg2JacExtendedC6 [32]g2JacExtended +type bucketg2JacExtendedC7 [64]g2JacExtended +type bucketg2JacExtendedC8 [128]g2JacExtended +type bucketg2JacExtendedC9 [256]g2JacExtended +type bucketg2JacExtendedC10 [512]g2JacExtended +type bucketg2JacExtendedC11 [1024]g2JacExtended +type bucketg2JacExtendedC12 [2048]g2JacExtended +type bucketg2JacExtendedC13 [4096]g2JacExtended +type bucketg2JacExtendedC14 [8192]g2JacExtended +type bucketg2JacExtendedC15 [16384]g2JacExtended +type bucketg2JacExtendedC16 [32768]g2JacExtended type ibg2JacExtended interface { bucketg2JacExtendedC1 | diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index f657bf2bcf..5cc17e3cc8 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -235,13 +235,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC10 [1 << (10 - 1)]G1Affine -type bucketG1AffineC11 [1 << (11 - 1)]G1Affine -type bucketG1AffineC12 [1 << (12 - 1)]G1Affine -type bucketG1AffineC13 [1 << (13 - 1)]G1Affine -type bucketG1AffineC14 [1 << (14 - 1)]G1Affine -type bucketG1AffineC15 [1 << (15 - 1)]G1Affine -type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC10 [512]G1Affine +type bucketG1AffineC11 [1024]G1Affine +type bucketG1AffineC12 [2048]G1Affine +type bucketG1AffineC13 [4096]G1Affine +type bucketG1AffineC14 [8192]G1Affine +type bucketG1AffineC15 [16384]G1Affine +type bucketG1AffineC16 [32768]G1Affine // buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { @@ -554,13 +554,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC10 [1 << (10 - 1)]G2Affine -type bucketG2AffineC11 [1 << (11 - 1)]G2Affine -type bucketG2AffineC12 [1 << (12 - 1)]G2Affine -type bucketG2AffineC13 [1 << (13 - 1)]G2Affine -type bucketG2AffineC14 [1 << (14 - 1)]G2Affine -type bucketG2AffineC15 [1 << (15 - 1)]G2Affine -type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC10 [512]G2Affine +type bucketG2AffineC11 [1024]G2Affine +type bucketG2AffineC12 [2048]G2Affine +type bucketG2AffineC13 [4096]G2Affine +type bucketG2AffineC14 [8192]G2Affine +type bucketG2AffineC15 [16384]G2Affine +type bucketG2AffineC16 [32768]G2Affine // buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { @@ -659,21 +659,21 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine -type bitSetC1 [1 << (1 - 1)]bool -type bitSetC3 [1 << (3 - 1)]bool -type bitSetC4 [1 << (4 - 1)]bool -type bitSetC5 [1 << (5 - 1)]bool -type bitSetC6 [1 << (6 - 1)]bool -type 
bitSetC7 [1 << (7 - 1)]bool -type bitSetC8 [1 << (8 - 1)]bool -type bitSetC9 [1 << (9 - 1)]bool -type bitSetC10 [1 << (10 - 1)]bool -type bitSetC11 [1 << (11 - 1)]bool -type bitSetC12 [1 << (12 - 1)]bool -type bitSetC13 [1 << (13 - 1)]bool -type bitSetC14 [1 << (14 - 1)]bool -type bitSetC15 [1 << (15 - 1)]bool -type bitSetC16 [1 << (16 - 1)]bool +type bitSetC1 [1]bool +type bitSetC3 [4]bool +type bitSetC4 [8]bool +type bitSetC5 [16]bool +type bitSetC6 [32]bool +type bitSetC7 [64]bool +type bitSetC8 [128]bool +type bitSetC9 [256]bool +type bitSetC10 [512]bool +type bitSetC11 [1024]bool +type bitSetC12 [2048]bool +type bitSetC13 [4096]bool +type bitSetC14 [8192]bool +type bitSetC15 [16384]bool +type bitSetC16 [32768]bool type bitSet interface { bitSetC1 | diff --git a/ecc/bls24-317/multiexp_jacobian.go b/ecc/bls24-317/multiexp_jacobian.go index 973219cc4b..6ecfd659e9 100644 --- a/ecc/bls24-317/multiexp_jacobian.go +++ b/ecc/bls24-317/multiexp_jacobian.go @@ -61,21 +61,21 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended -type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended -type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended -type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended -type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended -type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended -type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended -type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended -type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended -type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended -type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended -type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC1 [1]g1JacExtended +type bucketg1JacExtendedC3 [4]g1JacExtended +type bucketg1JacExtendedC4 [8]g1JacExtended +type bucketg1JacExtendedC5 [16]g1JacExtended +type bucketg1JacExtendedC6 [32]g1JacExtended +type bucketg1JacExtendedC7 [64]g1JacExtended +type bucketg1JacExtendedC8 [128]g1JacExtended +type bucketg1JacExtendedC9 [256]g1JacExtended +type bucketg1JacExtendedC10 [512]g1JacExtended +type bucketg1JacExtendedC11 [1024]g1JacExtended +type bucketg1JacExtendedC12 [2048]g1JacExtended +type bucketg1JacExtendedC13 [4096]g1JacExtended +type bucketg1JacExtendedC14 [8192]g1JacExtended +type bucketg1JacExtendedC15 [16384]g1JacExtended +type bucketg1JacExtendedC16 [32768]g1JacExtended type ibg1JacExtended interface { bucketg1JacExtendedC1 | @@ -140,21 +140,21 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended -type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended -type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended -type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended -type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended -type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended -type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended -type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended -type 
bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended -type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended -type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended -type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC1 [1]g2JacExtended +type bucketg2JacExtendedC3 [4]g2JacExtended +type bucketg2JacExtendedC4 [8]g2JacExtended +type bucketg2JacExtendedC5 [16]g2JacExtended +type bucketg2JacExtendedC6 [32]g2JacExtended +type bucketg2JacExtendedC7 [64]g2JacExtended +type bucketg2JacExtendedC8 [128]g2JacExtended +type bucketg2JacExtendedC9 [256]g2JacExtended +type bucketg2JacExtendedC10 [512]g2JacExtended +type bucketg2JacExtendedC11 [1024]g2JacExtended +type bucketg2JacExtendedC12 [2048]g2JacExtended +type bucketg2JacExtendedC13 [4096]g2JacExtended +type bucketg2JacExtendedC14 [8192]g2JacExtended +type bucketg2JacExtendedC15 [16384]g2JacExtended +type bucketg2JacExtendedC16 [32768]g2JacExtended type ibg2JacExtended interface { bucketg2JacExtendedC1 | diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index 1f132b885e..d38581dbb6 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -235,13 +235,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC10 [1 << (10 - 1)]G1Affine -type bucketG1AffineC11 [1 << (11 - 1)]G1Affine -type bucketG1AffineC12 [1 << (12 - 1)]G1Affine -type bucketG1AffineC13 [1 << (13 - 1)]G1Affine -type bucketG1AffineC14 [1 << (14 - 1)]G1Affine -type bucketG1AffineC15 [1 << (15 - 1)]G1Affine -type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC10 [512]G1Affine +type bucketG1AffineC11 [1024]G1Affine +type bucketG1AffineC12 [2048]G1Affine +type bucketG1AffineC13 [4096]G1Affine +type bucketG1AffineC14 [8192]G1Affine +type bucketG1AffineC15 [16384]G1Affine +type bucketG1AffineC16 [32768]G1Affine // buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { @@ -554,13 +554,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC10 [1 << (10 - 1)]G2Affine -type bucketG2AffineC11 [1 << (11 - 1)]G2Affine -type bucketG2AffineC12 [1 << (12 - 1)]G2Affine -type bucketG2AffineC13 [1 << (13 - 1)]G2Affine -type bucketG2AffineC14 [1 << (14 - 1)]G2Affine -type bucketG2AffineC15 [1 << (15 - 1)]G2Affine -type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC10 [512]G2Affine +type bucketG2AffineC11 [1024]G2Affine +type bucketG2AffineC12 [2048]G2Affine +type bucketG2AffineC13 [4096]G2Affine +type bucketG2AffineC14 [8192]G2Affine +type bucketG2AffineC15 [16384]G2Affine +type bucketG2AffineC16 [32768]G2Affine // buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { @@ -659,21 +659,21 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine -type bitSetC2 [1 << (2 - 1)]bool -type bitSetC3 [1 << (3 - 1)]bool -type bitSetC4 [1 << (4 - 1)]bool -type bitSetC5 [1 << (5 - 1)]bool -type bitSetC6 [1 << (6 - 1)]bool -type bitSetC7 [1 << (7 - 1)]bool -type bitSetC8 [1 << (8 - 1)]bool -type bitSetC9 [1 << (9 - 1)]bool -type bitSetC10 [1 << (10 - 1)]bool -type bitSetC11 [1 << (11 - 1)]bool -type bitSetC12 [1 << (12 - 1)]bool -type 
bitSetC13 [1 << (13 - 1)]bool -type bitSetC14 [1 << (14 - 1)]bool -type bitSetC15 [1 << (15 - 1)]bool -type bitSetC16 [1 << (16 - 1)]bool +type bitSetC2 [2]bool +type bitSetC3 [4]bool +type bitSetC4 [8]bool +type bitSetC5 [16]bool +type bitSetC6 [32]bool +type bitSetC7 [64]bool +type bitSetC8 [128]bool +type bitSetC9 [256]bool +type bitSetC10 [512]bool +type bitSetC11 [1024]bool +type bitSetC12 [2048]bool +type bitSetC13 [4096]bool +type bitSetC14 [8192]bool +type bitSetC15 [16384]bool +type bitSetC16 [32768]bool type bitSet interface { bitSetC2 | diff --git a/ecc/bn254/multiexp_jacobian.go b/ecc/bn254/multiexp_jacobian.go index a674d4f724..0bd2482a98 100644 --- a/ecc/bn254/multiexp_jacobian.go +++ b/ecc/bn254/multiexp_jacobian.go @@ -61,21 +61,21 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended -type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended -type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended -type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended -type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended -type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended -type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended -type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended -type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended -type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended -type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended -type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [2]g1JacExtended +type bucketg1JacExtendedC3 [4]g1JacExtended +type bucketg1JacExtendedC4 [8]g1JacExtended +type bucketg1JacExtendedC5 [16]g1JacExtended +type bucketg1JacExtendedC6 [32]g1JacExtended +type bucketg1JacExtendedC7 [64]g1JacExtended +type bucketg1JacExtendedC8 [128]g1JacExtended +type bucketg1JacExtendedC9 [256]g1JacExtended +type bucketg1JacExtendedC10 [512]g1JacExtended +type bucketg1JacExtendedC11 [1024]g1JacExtended +type bucketg1JacExtendedC12 [2048]g1JacExtended +type bucketg1JacExtendedC13 [4096]g1JacExtended +type bucketg1JacExtendedC14 [8192]g1JacExtended +type bucketg1JacExtendedC15 [16384]g1JacExtended +type bucketg1JacExtendedC16 [32768]g1JacExtended type ibg1JacExtended interface { bucketg1JacExtendedC2 | @@ -140,21 +140,21 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended -type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended -type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended -type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended -type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended -type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended -type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended -type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended -type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended -type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended -type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended -type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended -type 
bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [2]g2JacExtended +type bucketg2JacExtendedC3 [4]g2JacExtended +type bucketg2JacExtendedC4 [8]g2JacExtended +type bucketg2JacExtendedC5 [16]g2JacExtended +type bucketg2JacExtendedC6 [32]g2JacExtended +type bucketg2JacExtendedC7 [64]g2JacExtended +type bucketg2JacExtendedC8 [128]g2JacExtended +type bucketg2JacExtendedC9 [256]g2JacExtended +type bucketg2JacExtendedC10 [512]g2JacExtended +type bucketg2JacExtendedC11 [1024]g2JacExtended +type bucketg2JacExtendedC12 [2048]g2JacExtended +type bucketg2JacExtendedC13 [4096]g2JacExtended +type bucketg2JacExtendedC14 [8192]g2JacExtended +type bucketg2JacExtendedC15 [16384]g2JacExtended +type bucketg2JacExtendedC16 [32768]g2JacExtended type ibg2JacExtended interface { bucketg2JacExtendedC2 | diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index 949a53f642..d05f2ce04f 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -234,8 +234,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC12 [1 << (12 - 1)]G1Affine -type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC12 [2048]G1Affine +type bucketG1AffineC16 [32768]G1Affine // buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { @@ -493,8 +493,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC12 [1 << (12 - 1)]G2Affine -type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC12 [2048]G2Affine +type bucketG2AffineC16 [32768]G2Affine // buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { @@ -538,12 +538,12 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine -type bitSetC1 [1 << (1 - 1)]bool -type bitSetC4 [1 << (4 - 1)]bool -type bitSetC5 [1 << (5 - 1)]bool -type bitSetC8 [1 << (8 - 1)]bool -type bitSetC12 [1 << (12 - 1)]bool -type bitSetC16 [1 << (16 - 1)]bool +type bitSetC1 [1]bool +type bitSetC4 [8]bool +type bitSetC5 [16]bool +type bitSetC8 [128]bool +type bitSetC12 [2048]bool +type bitSetC16 [32768]bool type bitSet interface { bitSetC1 | diff --git a/ecc/bw6-633/multiexp_jacobian.go b/ecc/bw6-633/multiexp_jacobian.go index 497f2697fb..eb4a8a2a02 100644 --- a/ecc/bw6-633/multiexp_jacobian.go +++ b/ecc/bw6-633/multiexp_jacobian.go @@ -61,12 +61,12 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended -type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended -type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended -type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC1 [1]g1JacExtended +type bucketg1JacExtendedC4 [8]g1JacExtended +type bucketg1JacExtendedC5 [16]g1JacExtended +type bucketg1JacExtendedC8 [128]g1JacExtended +type bucketg1JacExtendedC12 [2048]g1JacExtended +type bucketg1JacExtendedC16 [32768]g1JacExtended type ibg1JacExtended interface { bucketg1JacExtendedC1 | @@ -122,12 +122,12 @@ func 
processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended -type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended -type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended -type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC1 [1]g2JacExtended +type bucketg2JacExtendedC4 [8]g2JacExtended +type bucketg2JacExtendedC5 [16]g2JacExtended +type bucketg2JacExtendedC8 [128]g2JacExtended +type bucketg2JacExtendedC12 [2048]g2JacExtended +type bucketg2JacExtendedC16 [32768]g2JacExtended type ibg2JacExtended interface { bucketg2JacExtendedC1 | diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index 83cd6d1d61..968db46e6f 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -234,8 +234,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC11 [1 << (11 - 1)]G1Affine -type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC11 [1024]G1Affine +type bucketG1AffineC16 [32768]G1Affine // buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { @@ -493,8 +493,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC11 [1 << (11 - 1)]G2Affine -type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC11 [1024]G2Affine +type bucketG2AffineC16 [32768]G2Affine // buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { @@ -538,12 +538,12 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine -type bitSetC3 [1 << (3 - 1)]bool -type bitSetC4 [1 << (4 - 1)]bool -type bitSetC5 [1 << (5 - 1)]bool -type bitSetC8 [1 << (8 - 1)]bool -type bitSetC11 [1 << (11 - 1)]bool -type bitSetC16 [1 << (16 - 1)]bool +type bitSetC3 [4]bool +type bitSetC4 [8]bool +type bitSetC5 [16]bool +type bitSetC8 [128]bool +type bitSetC11 [1024]bool +type bitSetC16 [32768]bool type bitSet interface { bitSetC3 | diff --git a/ecc/bw6-756/multiexp_jacobian.go b/ecc/bw6-756/multiexp_jacobian.go index 93fd87fe51..cd15044132 100644 --- a/ecc/bw6-756/multiexp_jacobian.go +++ b/ecc/bw6-756/multiexp_jacobian.go @@ -61,12 +61,12 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended -type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended -type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended -type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [4]g1JacExtended +type bucketg1JacExtendedC4 [8]g1JacExtended +type bucketg1JacExtendedC5 [16]g1JacExtended +type bucketg1JacExtendedC8 [128]g1JacExtended +type bucketg1JacExtendedC11 [1024]g1JacExtended +type bucketg1JacExtendedC16 [32768]g1JacExtended type ibg1JacExtended interface { bucketg1JacExtendedC3 | @@ -122,12 
+122,12 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended -type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended -type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended -type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [4]g2JacExtended +type bucketg2JacExtendedC4 [8]g2JacExtended +type bucketg2JacExtendedC5 [16]g2JacExtended +type bucketg2JacExtendedC8 [128]g2JacExtended +type bucketg2JacExtendedC11 [1024]g2JacExtended +type bucketg2JacExtendedC16 [32768]g2JacExtended type ibg2JacExtended interface { bucketg2JacExtendedC3 | diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index bfeea763cb..91750cd328 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -234,8 +234,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC10 [1 << (10 - 1)]G1Affine -type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC10 [512]G1Affine +type bucketG1AffineC16 [32768]G1Affine // buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { @@ -493,8 +493,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC10 [1 << (10 - 1)]G2Affine -type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC10 [512]G2Affine +type bucketG2AffineC16 [32768]G2Affine // buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { @@ -538,13 +538,13 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine -type bitSetC2 [1 << (2 - 1)]bool -type bitSetC3 [1 << (3 - 1)]bool -type bitSetC4 [1 << (4 - 1)]bool -type bitSetC5 [1 << (5 - 1)]bool -type bitSetC8 [1 << (8 - 1)]bool -type bitSetC10 [1 << (10 - 1)]bool -type bitSetC16 [1 << (16 - 1)]bool +type bitSetC2 [2]bool +type bitSetC3 [4]bool +type bitSetC4 [8]bool +type bitSetC5 [16]bool +type bitSetC8 [128]bool +type bitSetC10 [512]bool +type bitSetC16 [32768]bool type bitSet interface { bitSetC2 | diff --git a/ecc/bw6-761/multiexp_jacobian.go b/ecc/bw6-761/multiexp_jacobian.go index 59edd2d1bd..ca6b1610da 100644 --- a/ecc/bw6-761/multiexp_jacobian.go +++ b/ecc/bw6-761/multiexp_jacobian.go @@ -61,13 +61,13 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended -type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended -type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended -type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [2]g1JacExtended +type bucketg1JacExtendedC3 [4]g1JacExtended +type bucketg1JacExtendedC4 [8]g1JacExtended +type bucketg1JacExtendedC5 [16]g1JacExtended +type bucketg1JacExtendedC8 [128]g1JacExtended 
+type bucketg1JacExtendedC10 [512]g1JacExtended +type bucketg1JacExtendedC16 [32768]g1JacExtended type ibg1JacExtended interface { bucketg1JacExtendedC2 | @@ -124,13 +124,13 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended -type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended -type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended -type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [2]g2JacExtended +type bucketg2JacExtendedC3 [4]g2JacExtended +type bucketg2JacExtendedC4 [8]g2JacExtended +type bucketg2JacExtendedC5 [16]g2JacExtended +type bucketg2JacExtendedC8 [128]g2JacExtended +type bucketg2JacExtendedC10 [512]g2JacExtended +type bucketg2JacExtendedC16 [32768]g2JacExtended type ibg2JacExtended interface { bucketg2JacExtendedC2 | diff --git a/internal/generator/ecc/generate.go b/internal/generator/ecc/generate.go index 9af8b0dd15..2dce39800b 100644 --- a/internal/generator/ecc/generate.go +++ b/internal/generator/ecc/generate.go @@ -69,6 +69,10 @@ func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) er funcs["lastC"] = lastC funcs["batchSize"] = batchSize + funcs["nbBuckets"] = func(c int) int { + return 1 << (c - 1) + } + funcs["contains"] = func(v int, s []int) bool { for _, sv := range s { if v == sv { diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index a3c609910f..23baf3d16d 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -239,7 +239,7 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B // this allow us to allocate the buckets on the stack {{- range $c := $.CRange}} {{- if gt $c 9}} -type bucket{{ $.TAffine }}C{{$c}} [1<<({{$c}}-1)]{{ $.TAffine }} +type bucket{{ $.TAffine }}C{{$c}} [{{nbBuckets $c}}]{{ $.TAffine }} {{- end}} {{- end}} @@ -304,7 +304,7 @@ type q{{ $.TAffine }}C{{$c}} [{{batchSize $c}}]batchOp{{ $.TAffine }} {{end }} {{- range $c := $.G1.CRange}} -type bitSetC{{$c}} [1<<({{$c}}-1)]bool +type bitSetC{{$c}} [{{nbBuckets $c}}]bool {{- end}} type bitSet interface { diff --git a/internal/generator/ecc/template/multiexp_jacobian.go.tmpl b/internal/generator/ecc/template/multiexp_jacobian.go.tmpl index 3fd44311bc..166d185faa 100644 --- a/internal/generator/ecc/template/multiexp_jacobian.go.tmpl +++ b/internal/generator/ecc/template/multiexp_jacobian.go.tmpl @@ -64,7 +64,7 @@ func processChunk{{ $.UPointName }}Jacobian[B ib{{ $.TJacobianExtended }}](chunk // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack {{- range $c := $.CRange}} -type bucket{{ $.TJacobianExtended }}C{{$c}} [1<<({{$c}}-1)]{{ $.TJacobianExtended }} +type bucket{{ $.TJacobianExtended }}C{{$c}} [{{nbBuckets $c}}]{{ $.TJacobianExtended }} {{- end}} type ib{{ $.TJacobianExtended }} interface { From 0eb6955ebd0acb5c1ee4a8c1f147918247d4a36d Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 16 Nov 2022 21:23:13 -0600 Subject: [PATCH 38/43] feat: deal with doubling edge case using other set of buckets --- ecc/bls12-377/multiexp_affine.go | 22 +++++++------------ 
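The literal array lengths above are the evaluated form of 1 << (c - 1): a window of c bits needs only 2^(c-1) buckets, because the signed-digit decomposition in partitionScalars keeps each digit's absolute value at most 2^(c-1), and a digit -d reuses the bucket of d by subtracting the point instead of adding it. A minimal sketch of the nbBuckets helper registered in generate.go, checked against two of the generated sizes (the main function is illustrative only):

package main

import "fmt"

// nbBuckets returns the bucket count for a window of c bits.
// Digits d and -d share bucket d-1, so 2^(c-1) buckets suffice
// instead of the 2^c - 1 an unsigned decomposition would need.
func nbBuckets(c int) int {
	return 1 << (c - 1)
}

func main() {
	fmt.Println(nbBuckets(10)) // 512, as in bucketG1AffineC10
	fmt.Println(nbBuckets(16)) // 32768, as in bucketG1AffineC16
}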
ecc/bls12-378/multiexp_affine.go | 22 +++++++------------ ecc/bls12-381/multiexp_affine.go | 22 +++++++------------ ecc/bls24-315/multiexp_affine.go | 22 +++++++------------ ecc/bls24-317/multiexp_affine.go | 22 +++++++------------ ecc/bn254/multiexp_affine.go | 22 +++++++------------ ecc/bw6-633/multiexp_affine.go | 22 +++++++------------ ecc/bw6-756/multiexp_affine.go | 22 +++++++------------ ecc/bw6-761/multiexp_affine.go | 22 +++++++------------ .../ecc/template/multiexp_affine.go.tmpl | 11 ++++------ 10 files changed, 76 insertions(+), 133 deletions(-) diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index 331f283ede..df95b34370 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -97,9 +97,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -127,10 +126,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -139,7 +136,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } @@ -416,9 +413,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -446,10 +442,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -458,7 +452,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index 0f65e1838d..28301102cf 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -97,9 +97,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? 
- BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -127,10 +126,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -139,7 +136,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } @@ -416,9 +413,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -446,10 +442,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -458,7 +452,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index c566026823..66f1e361a1 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -97,9 +97,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -127,10 +126,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -139,7 +136,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } @@ -416,9 +413,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? 
- BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -446,10 +442,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -458,7 +452,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index 4bf2f7f50c..5db6512603 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -97,9 +97,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -127,10 +126,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -139,7 +136,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } @@ -416,9 +413,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -446,10 +442,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -458,7 +452,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index 5cc17e3cc8..37050a251b 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -97,9 +97,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. 
- // need doubling in affine implemented ? - BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -127,10 +126,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -139,7 +136,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } @@ -416,9 +413,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -446,10 +442,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -458,7 +452,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index d38581dbb6..c0fd33431e 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -97,9 +97,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -127,10 +126,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -139,7 +136,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } @@ -416,9 +413,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? 
- BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -446,10 +442,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -458,7 +452,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index d05f2ce04f..f7189e2c35 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -96,9 +96,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -126,10 +125,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -138,7 +135,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } @@ -355,9 +352,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -385,10 +381,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -397,7 +391,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index 968db46e6f..086c2e9f83 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -96,9 +96,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? 
- BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -126,10 +125,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -138,7 +135,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } @@ -355,9 +352,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -385,10 +381,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -397,7 +391,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index 91750cd328..5f423838c4 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -96,9 +96,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -126,10 +125,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -138,7 +135,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } @@ -355,9 +352,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? 
- BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -385,10 +381,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -397,7 +391,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index 23baf3d16d..979d05c00a 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -96,9 +96,8 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -126,10 +125,8 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? 
if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -138,7 +135,7 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } From c1ec769268b0cfd298e4280ebe1b961224b75066 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 16 Nov 2022 21:29:02 -0600 Subject: [PATCH 39/43] test: add some doublings in msm test --- ecc/bls12-377/multiexp_test.go | 12 ++++++++++++ ecc/bls12-378/multiexp_test.go | 12 ++++++++++++ ecc/bls12-381/multiexp_test.go | 12 ++++++++++++ ecc/bls24-315/multiexp_test.go | 12 ++++++++++++ ecc/bls24-317/multiexp_test.go | 12 ++++++++++++ ecc/bn254/multiexp_test.go | 12 ++++++++++++ ecc/bw6-633/multiexp_test.go | 12 ++++++++++++ ecc/bw6-756/multiexp_test.go | 12 ++++++++++++ ecc/bw6-761/multiexp_test.go | 12 ++++++++++++ .../generator/ecc/template/tests/multiexp.go.tmpl | 7 +++++++ 10 files changed, 115 insertions(+) diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index f2487e2edc..eaf9952de7 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -256,6 +256,12 @@ func TestCrossMultiExpG1(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size cRange := []uint64{2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { @@ -668,6 +674,12 @@ func TestCrossMultiExpG2(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. // test only "odd" and "even" (ie windows size divide word size vs not) diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index 55524da71e..7fcb05040a 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -256,6 +256,12 @@ func TestCrossMultiExpG1(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size cRange := []uint64{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { @@ -668,6 +674,12 @@ func TestCrossMultiExpG2(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. 
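The fix above works because each batch-affine chunk processor keeps a parallel set of extended-Jacobian buckets: the batched affine addition amortizes one field inversion across the whole batch to compute lambda = (yP - yR)/(xP - xR), so it cannot handle P == R, where that denominator vanishes and a doubling would need lambda = 3x^2/(2y) instead. Rather than implementing an affine doubling inside the batch, the operation is diverted to the extended-Jacobian bucket with the same bucketID; assuming the final bucket value is the sum of the affine and extended-Jacobian halves (the reduction itself is not shown in these hunks), adding one copy of the point there while the affine bucket keeps the other yields 2*P in total. A consolidated sketch of the four equal-X cases, using the names from the diff:

// BK: affine bucket, PP: incoming affine point, bucketsJE: parallel
// extended-Jacobian buckets. Net bucket value = BK + bucketsJE[bucketID].
if BK.X.Equal(&PP.X) {
	if BK.Y.Equal(&PP.Y) { // BK == PP
		if isAdd {
			bucketsJE[bucketID].addMixed(PP) // total: PP + PP = 2*PP
		} else {
			BK.setInfinity() // total: PP - PP = 0
		}
	} else { // BK == -PP
		if isAdd {
			BK.setInfinity() // total: -PP + PP = 0
		} else {
			bucketsJE[bucketID].subMixed(PP) // total: -PP - PP = -2*PP
		}
	}
	return
}

The "sprinkle some doublings" loops added to the tests below exercise exactly this path: duplicating (point, scalar) pairs guarantees that identical digits route the same point into the same bucket, so the BK == PP case actually occurs.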
// test only "odd" and "even" (ie windows size divide word size vs not) diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index 8d96b5c59e..4b013c619a 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -256,6 +256,12 @@ func TestCrossMultiExpG1(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size cRange := []uint64{3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { @@ -668,6 +674,12 @@ func TestCrossMultiExpG2(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. // test only "odd" and "even" (ie windows size divide word size vs not) diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index 4e67c67761..c86dfdad0d 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -256,6 +256,12 @@ func TestCrossMultiExpG1(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size cRange := []uint64{2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { @@ -668,6 +674,12 @@ func TestCrossMultiExpG2(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. 
// test only "odd" and "even" (ie windows size divide word size vs not) diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index 33e7c834c5..8287221bbd 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -256,6 +256,12 @@ func TestCrossMultiExpG1(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size cRange := []uint64{3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { @@ -668,6 +674,12 @@ func TestCrossMultiExpG2(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. // test only "odd" and "even" (ie windows size divide word size vs not) diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index 3307840f6a..e64b8c9c32 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -256,6 +256,12 @@ func TestCrossMultiExpG1(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size cRange := []uint64{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { @@ -668,6 +674,12 @@ func TestCrossMultiExpG2(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. 
// test only "odd" and "even" (ie windows size divide word size vs not) diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index dc7ef60c2c..242ccf1fc0 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -256,6 +256,12 @@ func TestCrossMultiExpG1(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size cRange := []uint64{4, 5, 8, 12, 16} if testing.Short() { @@ -668,6 +674,12 @@ func TestCrossMultiExpG2(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. // test only "odd" and "even" (ie windows size divide word size vs not) diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index 308efca4c9..70643ca8f5 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -256,6 +256,12 @@ func TestCrossMultiExpG1(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size cRange := []uint64{3, 4, 5, 8, 11, 16} if testing.Short() { @@ -668,6 +674,12 @@ func TestCrossMultiExpG2(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. // test only "odd" and "even" (ie windows size divide word size vs not) diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index 98560005bc..3532346ae9 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -256,6 +256,12 @@ func TestCrossMultiExpG1(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size cRange := []uint64{2, 3, 4, 5, 8, 10, 16} if testing.Short() { @@ -668,6 +674,12 @@ func TestCrossMultiExpG2(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. 
// test only "odd" and "even" (ie windows size divide word size vs not) diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index d99a83a49f..d929eb0334 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -265,9 +265,16 @@ func TestCrossMultiExp{{ $.UPointName }}(t *testing.T) { samplePoints[rand.Intn(nbSamples)].setInfinity() samplePoints[rand.Intn(nbSamples)].setInfinity() + var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i:=10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size {{- if eq $.PointName "g1" }} cRange := []uint64{ From 4dbc3643a0e565e0d83716f22aeeae238fea8d25 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Thu, 17 Nov 2022 13:09:17 -0600 Subject: [PATCH 40/43] fix: msm partitionScalar - handle edge cases with carry --- ecc/bls12-377/multiexp.go | 48 ++++++++++------ ecc/bls12-377/multiexp_affine.go | 4 +- ecc/bls12-377/multiexp_jacobian.go | 8 +-- ecc/bls12-378/multiexp.go | 44 ++++++++++----- ecc/bls12-381/multiexp.go | 48 ++++++++++------ ecc/bls12-381/multiexp_affine.go | 4 +- ecc/bls12-381/multiexp_jacobian.go | 8 +-- ecc/bls24-315/multiexp.go | 48 ++++++++++------ ecc/bls24-315/multiexp_affine.go | 4 +- ecc/bls24-315/multiexp_jacobian.go | 8 +-- ecc/bls24-317/multiexp.go | 48 ++++++++++------ ecc/bls24-317/multiexp_affine.go | 4 +- ecc/bls24-317/multiexp_jacobian.go | 8 +-- ecc/bn254/multiexp.go | 44 ++++++++++----- ecc/bw6-633/multiexp.go | 48 ++++++++++------ ecc/bw6-633/multiexp_affine.go | 4 +- ecc/bw6-633/multiexp_jacobian.go | 8 +-- ecc/bw6-756/multiexp.go | 44 ++++++++++----- ecc/bw6-761/multiexp.go | 44 ++++++++++----- internal/field/field.go | 18 +++++- internal/generator/ecc/generate.go | 55 +++++++++++++------ .../generator/ecc/template/multiexp.go.tmpl | 44 ++++++++++----- 22 files changed, 370 insertions(+), 223 deletions(-) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index 00f3a97050..c81bd397ad 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -183,8 +183,6 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { - case 1: - return processChunkG1Jacobian[bucketg1JacExtendedC1] case 2: return processChunkG1Jacobian[bucketg1JacExtendedC2] case 4: @@ -444,8 +442,6 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { - case 1: - return processChunkG2Jacobian[bucketg2JacExtendedC1] case 2: return processChunkG2Jacobian[bucketg2JacExtendedC2] case 4: @@ -560,25 +556,27 @@ type selector struct { } // return number of chunks for a given window size c +// the last chunk may be bigger to accomodate a potential carry from the NAF decomposition func computeNbChunks(c uint64) uint64 { - // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF - // decomposition in partitionScalars - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } - return 
nbChunks + return (fr.Bits + c - 1) / c } // return the last window size for a scalar; if c divides the scalar size // then it returns c // if not, returns lastC << c func lastC(c uint64) uint64 { - const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition - if n%c == 0 { - return c + nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits + if nbAvailableBits == 0 { + // we can push a bit the edge case here; + // if the c-msb bits of modulus are not all ones, we have space for the carry + // (assuming inputs are smaller than modulus) + const qMsb16 = 0b1001010101011011 + msbC := qMsb16 >> (16 - c) + if !(msbC&((1<= max { + if digit > max { digit -= (1 << c) carry = 1 } @@ -679,6 +677,20 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } digits[int(chunk)*len(scalars)+i] = bits } + + // for the last chunk, we don't want to borrow from a next window + // (but may have a larger max value) + chunk := nbChunks - 1 + s := selectors[chunk] + // init with carry if any + digit := carry + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } + digits[int(chunk)*len(scalars)+i] = uint16(digit) << 1 } }, nbTasks) diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index df95b34370..5aa3546b5e 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -653,7 +653,6 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine -type bitSetC1 [1]bool type bitSetC2 [2]bool type bitSetC4 [8]bool type bitSetC5 [16]bool @@ -670,8 +669,7 @@ type bitSetC15 [16384]bool type bitSetC16 [32768]bool type bitSet interface { - bitSetC1 | - bitSetC2 | + bitSetC2 | bitSetC4 | bitSetC5 | bitSetC6 | diff --git a/ecc/bls12-377/multiexp_jacobian.go b/ecc/bls12-377/multiexp_jacobian.go index f34d5ff332..e01f5567f5 100644 --- a/ecc/bls12-377/multiexp_jacobian.go +++ b/ecc/bls12-377/multiexp_jacobian.go @@ -61,7 +61,6 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg1JacExtendedC1 [1]g1JacExtended type bucketg1JacExtendedC2 [2]g1JacExtended type bucketg1JacExtendedC4 [8]g1JacExtended type bucketg1JacExtendedC5 [16]g1JacExtended @@ -78,8 +77,7 @@ type bucketg1JacExtendedC15 [16384]g1JacExtended type bucketg1JacExtendedC16 [32768]g1JacExtended type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC2 | + bucketg1JacExtendedC2 | bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC6 | @@ -140,7 +138,6 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg2JacExtendedC1 [1]g2JacExtended type bucketg2JacExtendedC2 [2]g2JacExtended type bucketg2JacExtendedC4 [8]g2JacExtended type bucketg2JacExtendedC5 [16]g2JacExtended @@ -157,8 +154,7 @@ type bucketg2JacExtendedC15 [16384]g2JacExtended type bucketg2JacExtendedC16 [32768]g2JacExtended type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC2 | + bucketg2JacExtendedC2 | bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC6 | diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 133242f8af..8e710a39c2 100644 
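In words: computeNbChunks is now a plain ceiling division over fr.Bits, and the old "+1 for the carry" slack moves into lastC, which widens the final window by the leftover bits nbAvailableBits = nbChunks*c - fr.Bits. Only when c divides fr.Bits exactly is there no headroom, and the patch then inspects the top c bits of the modulus (the per-curve qMsb16 constant): if they are not all ones, a canonical scalar's top window can absorb the +1 carry without overflowing. A worked example, assuming fr.Bits = 253 (the BLS12-377 scalar field, whose modulus MSBs match the qMsb16 above):

// nbChunks = (fr.Bits + c - 1) / c ; nbAvailableBits = nbChunks*c - fr.Bits
//
// c = 16: nbChunks = (253+15)/16 = 16, spanning 256 bits, so the last
//         window has 256 - 253 = 3 spare bits and the carry always fits.
// c = 11: nbChunks = 23, spanning exactly 253 bits, so nbAvailableBits = 0;
//         the carry still fits because the top 11 modulus bits,
//         qMsb16 >> 5 = 0b10010101010, are not all ones, and inputs are
//         assumed reduced below the modulus.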
--- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -560,25 +560,27 @@ type selector struct { } // return number of chunks for a given window size c +// the last chunk may be bigger to accomodate a potential carry from the NAF decomposition func computeNbChunks(c uint64) uint64 { - // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF - // decomposition in partitionScalars - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } - return nbChunks + return (fr.Bits + c - 1) / c } // return the last window size for a scalar; if c divides the scalar size // then it returns c // if not, returns lastC << c func lastC(c uint64) uint64 { - const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition - if n%c == 0 { - return c + nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits + if nbAvailableBits == 0 { + // we can push a bit the edge case here; + // if the c-msb bits of modulus are not all ones, we have space for the carry + // (assuming inputs are smaller than modulus) + const qMsb16 = 0b1000001110011110 + msbC := qMsb16 >> (16 - c) + if !(msbC&((1<= max { + if digit > max { digit -= (1 << c) carry = 1 } @@ -679,6 +681,20 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } digits[int(chunk)*len(scalars)+i] = bits } + + // for the last chunk, we don't want to borrow from a next window + // (but may have a larger max value) + chunk := nbChunks - 1 + s := selectors[chunk] + // init with carry if any + digit := carry + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } + digits[int(chunk)*len(scalars)+i] = uint16(digit) << 1 } }, nbTasks) diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index 60e4686759..173d99e6ee 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -183,8 +183,6 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { - case 1: - return processChunkG1Jacobian[bucketg1JacExtendedC1] case 3: return processChunkG1Jacobian[bucketg1JacExtendedC3] case 4: @@ -444,8 +442,6 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { - case 1: - return processChunkG2Jacobian[bucketg2JacExtendedC1] case 3: return processChunkG2Jacobian[bucketg2JacExtendedC3] case 4: @@ -560,25 +556,27 @@ type selector struct { } // return number of chunks for a given window size c +// the last chunk may be bigger to accomodate a potential carry from the NAF decomposition func computeNbChunks(c uint64) uint64 { - // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF - // decomposition in partitionScalars - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } - return nbChunks + return (fr.Bits + c - 1) / c } // return the last window size for a scalar; if c divides the scalar size // then it returns c // if not, returns lastC << c func lastC(c uint64) uint64 { - const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition - if n%c == 0 { - return c + 
nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits + if nbAvailableBits == 0 { + // we can push a bit the edge case here; + // if the c-msb bits of modulus are not all ones, we have space for the carry + // (assuming inputs are smaller than modulus) + const qMsb16 = 0b1110011111011011 + msbC := qMsb16 >> (16 - c) + if !(msbC&((1<= max { + if digit > max { digit -= (1 << c) carry = 1 } @@ -679,6 +677,20 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } digits[int(chunk)*len(scalars)+i] = bits } + + // for the last chunk, we don't want to borrow from a next window + // (but may have a larger max value) + chunk := nbChunks - 1 + s := selectors[chunk] + // init with carry if any + digit := carry + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } + digits[int(chunk)*len(scalars)+i] = uint16(digit) << 1 } }, nbTasks) diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index 66f1e361a1..f2fcc05732 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -653,7 +653,6 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine -type bitSetC1 [1]bool type bitSetC3 [4]bool type bitSetC4 [8]bool type bitSetC5 [16]bool @@ -670,8 +669,7 @@ type bitSetC15 [16384]bool type bitSetC16 [32768]bool type bitSet interface { - bitSetC1 | - bitSetC3 | + bitSetC3 | bitSetC4 | bitSetC5 | bitSetC6 | diff --git a/ecc/bls12-381/multiexp_jacobian.go b/ecc/bls12-381/multiexp_jacobian.go index 55cdd766b5..2a2f8caa85 100644 --- a/ecc/bls12-381/multiexp_jacobian.go +++ b/ecc/bls12-381/multiexp_jacobian.go @@ -61,7 +61,6 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg1JacExtendedC1 [1]g1JacExtended type bucketg1JacExtendedC3 [4]g1JacExtended type bucketg1JacExtendedC4 [8]g1JacExtended type bucketg1JacExtendedC5 [16]g1JacExtended @@ -78,8 +77,7 @@ type bucketg1JacExtendedC15 [16384]g1JacExtended type bucketg1JacExtendedC16 [32768]g1JacExtended type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC3 | + bucketg1JacExtendedC3 | bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC6 | @@ -140,7 +138,6 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg2JacExtendedC1 [1]g2JacExtended type bucketg2JacExtendedC3 [4]g2JacExtended type bucketg2JacExtendedC4 [8]g2JacExtended type bucketg2JacExtendedC5 [16]g2JacExtended @@ -157,8 +154,7 @@ type bucketg2JacExtendedC15 [16384]g2JacExtended type bucketg2JacExtendedC16 [32768]g2JacExtended type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC3 | + bucketg2JacExtendedC3 | bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC6 | diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 85c2a14d17..488bdd5837 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -183,8 +183,6 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits 
[]uint16) { switch c { - case 1: - return processChunkG1Jacobian[bucketg1JacExtendedC1] case 2: return processChunkG1Jacobian[bucketg1JacExtendedC2] case 4: @@ -444,8 +442,6 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { - case 1: - return processChunkG2Jacobian[bucketg2JacExtendedC1] case 2: return processChunkG2Jacobian[bucketg2JacExtendedC2] case 4: @@ -560,25 +556,27 @@ type selector struct { } // return number of chunks for a given window size c +// the last chunk may be bigger to accomodate a potential carry from the NAF decomposition func computeNbChunks(c uint64) uint64 { - // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF - // decomposition in partitionScalars - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } - return nbChunks + return (fr.Bits + c - 1) / c } // return the last window size for a scalar; if c divides the scalar size // then it returns c // if not, returns lastC << c func lastC(c uint64) uint64 { - const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition - if n%c == 0 { - return c + nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits + if nbAvailableBits == 0 { + // we can push a bit the edge case here; + // if the c-msb bits of modulus are not all ones, we have space for the carry + // (assuming inputs are smaller than modulus) + const qMsb16 = 0b1100101101101111 + msbC := qMsb16 >> (16 - c) + if !(msbC&((1<= max { + if digit > max { digit -= (1 << c) carry = 1 } @@ -679,6 +677,20 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } digits[int(chunk)*len(scalars)+i] = bits } + + // for the last chunk, we don't want to borrow from a next window + // (but may have a larger max value) + chunk := nbChunks - 1 + s := selectors[chunk] + // init with carry if any + digit := carry + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } + digits[int(chunk)*len(scalars)+i] = uint16(digit) << 1 } }, nbTasks) diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index 5db6512603..40a45408fe 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -653,7 +653,6 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine -type bitSetC1 [1]bool type bitSetC2 [2]bool type bitSetC4 [8]bool type bitSetC5 [16]bool @@ -670,8 +669,7 @@ type bitSetC15 [16384]bool type bitSetC16 [32768]bool type bitSet interface { - bitSetC1 | - bitSetC2 | + bitSetC2 | bitSetC4 | bitSetC5 | bitSetC6 | diff --git a/ecc/bls24-315/multiexp_jacobian.go b/ecc/bls24-315/multiexp_jacobian.go index 0cd3432dad..be0bb121b4 100644 --- a/ecc/bls24-315/multiexp_jacobian.go +++ b/ecc/bls24-315/multiexp_jacobian.go @@ -61,7 +61,6 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg1JacExtendedC1 [1]g1JacExtended type bucketg1JacExtendedC2 [2]g1JacExtended type bucketg1JacExtendedC4 [8]g1JacExtended type bucketg1JacExtendedC5 [16]g1JacExtended @@ -78,8 +77,7 @@ type bucketg1JacExtendedC15 [16384]g1JacExtended type 
bucketg1JacExtendedC16 [32768]g1JacExtended type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC2 | + bucketg1JacExtendedC2 | bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC6 | @@ -140,7 +138,6 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg2JacExtendedC1 [1]g2JacExtended type bucketg2JacExtendedC2 [2]g2JacExtended type bucketg2JacExtendedC4 [8]g2JacExtended type bucketg2JacExtendedC5 [16]g2JacExtended @@ -157,8 +154,7 @@ type bucketg2JacExtendedC15 [16384]g2JacExtended type bucketg2JacExtendedC16 [32768]g2JacExtended type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC2 | + bucketg2JacExtendedC2 | bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC6 | diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index 733358396c..f5f31de3d4 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -183,8 +183,6 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { - case 1: - return processChunkG1Jacobian[bucketg1JacExtendedC1] case 3: return processChunkG1Jacobian[bucketg1JacExtendedC3] case 4: @@ -444,8 +442,6 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { - case 1: - return processChunkG2Jacobian[bucketg2JacExtendedC1] case 3: return processChunkG2Jacobian[bucketg2JacExtendedC3] case 4: @@ -560,25 +556,27 @@ type selector struct { } // return number of chunks for a given window size c +// the last chunk may be bigger to accomodate a potential carry from the NAF decomposition func computeNbChunks(c uint64) uint64 { - // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF - // decomposition in partitionScalars - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } - return nbChunks + return (fr.Bits + c - 1) / c } // return the last window size for a scalar; if c divides the scalar size // then it returns c // if not, returns lastC << c func lastC(c uint64) uint64 { - const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition - if n%c == 0 { - return c + nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits + if nbAvailableBits == 0 { + // we can push a bit the edge case here; + // if the c-msb bits of modulus are not all ones, we have space for the carry + // (assuming inputs are smaller than modulus) + const qMsb16 = 0b1000100001111111 + msbC := qMsb16 >> (16 - c) + if !(msbC&((1<= max { + if digit > max { digit -= (1 << c) carry = 1 } @@ -679,6 +677,20 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } digits[int(chunk)*len(scalars)+i] = bits } + + // for the last chunk, we don't want to borrow from a next window + // (but may have a larger max value) + chunk := nbChunks - 1 + s := selectors[chunk] + // init with carry if any + digit := carry + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << 
s.shiftHigh + } + digits[int(chunk)*len(scalars)+i] = uint16(digit) << 1 } }, nbTasks) diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index 37050a251b..803835d815 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -653,7 +653,6 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine -type bitSetC1 [1]bool type bitSetC3 [4]bool type bitSetC4 [8]bool type bitSetC5 [16]bool @@ -670,8 +669,7 @@ type bitSetC15 [16384]bool type bitSetC16 [32768]bool type bitSet interface { - bitSetC1 | - bitSetC3 | + bitSetC3 | bitSetC4 | bitSetC5 | bitSetC6 | diff --git a/ecc/bls24-317/multiexp_jacobian.go b/ecc/bls24-317/multiexp_jacobian.go index 6ecfd659e9..15fbf46f0e 100644 --- a/ecc/bls24-317/multiexp_jacobian.go +++ b/ecc/bls24-317/multiexp_jacobian.go @@ -61,7 +61,6 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg1JacExtendedC1 [1]g1JacExtended type bucketg1JacExtendedC3 [4]g1JacExtended type bucketg1JacExtendedC4 [8]g1JacExtended type bucketg1JacExtendedC5 [16]g1JacExtended @@ -78,8 +77,7 @@ type bucketg1JacExtendedC15 [16384]g1JacExtended type bucketg1JacExtendedC16 [32768]g1JacExtended type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC3 | + bucketg1JacExtendedC3 | bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC6 | @@ -140,7 +138,6 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg2JacExtendedC1 [1]g2JacExtended type bucketg2JacExtendedC3 [4]g2JacExtended type bucketg2JacExtendedC4 [8]g2JacExtended type bucketg2JacExtendedC5 [16]g2JacExtended @@ -157,8 +154,7 @@ type bucketg2JacExtendedC15 [16384]g2JacExtended type bucketg2JacExtendedC16 [32768]g2JacExtended type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC3 | + bucketg2JacExtendedC3 | bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC6 | diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index d373d1683e..88a5e5a1e2 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -560,25 +560,27 @@ type selector struct { } // return number of chunks for a given window size c +// the last chunk may be bigger to accomodate a potential carry from the NAF decomposition func computeNbChunks(c uint64) uint64 { - // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF - // decomposition in partitionScalars - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } - return nbChunks + return (fr.Bits + c - 1) / c } // return the last window size for a scalar; if c divides the scalar size // then it returns c // if not, returns lastC << c func lastC(c uint64) uint64 { - const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition - if n%c == 0 { - return c + nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits + if nbAvailableBits == 0 { + // we can push a bit the edge case here; + // if the c-msb bits of modulus are not all ones, we have space for the carry + // (assuming inputs are smaller than modulus) + const qMsb16 = 0b1100000110010001 + msbC := qMsb16 >> (16 - c) + if !(msbC&((1<= max { + if digit > max { digit -= (1 << c) carry = 1 } @@ -679,6 +681,20 @@ func partitionScalars(scalars 
[]fr.Element, c uint64, scalarsMont bool, nbTasks } digits[int(chunk)*len(scalars)+i] = bits } + + // for the last chunk, we don't want to borrow from a next window + // (but may have a larger max value) + chunk := nbChunks - 1 + s := selectors[chunk] + // init with carry if any + digit := carry + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } + digits[int(chunk)*len(scalars)+i] = uint16(digit) << 1 } }, nbTasks) diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index dbbd344b08..e78cb54082 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -183,8 +183,6 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { - case 1: - return processChunkG1Jacobian[bucketg1JacExtendedC1] case 4: return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: @@ -391,8 +389,6 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { - case 1: - return processChunkG2Jacobian[bucketg2JacExtendedC1] case 4: return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: @@ -454,25 +450,27 @@ type selector struct { } // return number of chunks for a given window size c +// the last chunk may be bigger to accomodate a potential carry from the NAF decomposition func computeNbChunks(c uint64) uint64 { - // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF - // decomposition in partitionScalars - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } - return nbChunks + return (fr.Bits + c - 1) / c } // return the last window size for a scalar; if c divides the scalar size // then it returns c // if not, returns lastC << c func lastC(c uint64) uint64 { - const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition - if n%c == 0 { - return c + nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits + if nbAvailableBits == 0 { + // we can push a bit the edge case here; + // if the c-msb bits of modulus are not all ones, we have space for the carry + // (assuming inputs are smaller than modulus) + const qMsb16 = 0b1001100001000111 + msbC := qMsb16 >> (16 - c) + if !(msbC&((1<= max { + if digit > max { digit -= (1 << c) carry = 1 } @@ -573,6 +571,20 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } digits[int(chunk)*len(scalars)+i] = bits } + + // for the last chunk, we don't want to borrow from a next window + // (but may have a larger max value) + chunk := nbChunks - 1 + s := selectors[chunk] + // init with carry if any + digit := carry + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } + digits[int(chunk)*len(scalars)+i] = uint16(digit) << 1 } }, nbTasks) diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index f7189e2c35..fc4e13a30f 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -532,7 +532,6 @@ type pG2AffineC16 
[640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine -type bitSetC1 [1]bool type bitSetC4 [8]bool type bitSetC5 [16]bool type bitSetC8 [128]bool @@ -540,8 +539,7 @@ type bitSetC12 [2048]bool type bitSetC16 [32768]bool type bitSet interface { - bitSetC1 | - bitSetC4 | + bitSetC4 | bitSetC5 | bitSetC8 | bitSetC12 | diff --git a/ecc/bw6-633/multiexp_jacobian.go b/ecc/bw6-633/multiexp_jacobian.go index eb4a8a2a02..7e44a83aab 100644 --- a/ecc/bw6-633/multiexp_jacobian.go +++ b/ecc/bw6-633/multiexp_jacobian.go @@ -61,7 +61,6 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg1JacExtendedC1 [1]g1JacExtended type bucketg1JacExtendedC4 [8]g1JacExtended type bucketg1JacExtendedC5 [16]g1JacExtended type bucketg1JacExtendedC8 [128]g1JacExtended @@ -69,8 +68,7 @@ type bucketg1JacExtendedC12 [2048]g1JacExtended type bucketg1JacExtendedC16 [32768]g1JacExtended type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC4 | + bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC8 | bucketg1JacExtendedC12 | @@ -122,7 +120,6 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg2JacExtendedC1 [1]g2JacExtended type bucketg2JacExtendedC4 [8]g2JacExtended type bucketg2JacExtendedC5 [16]g2JacExtended type bucketg2JacExtendedC8 [128]g2JacExtended @@ -130,8 +127,7 @@ type bucketg2JacExtendedC12 [2048]g2JacExtended type bucketg2JacExtendedC16 [32768]g2JacExtended type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC4 | + bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC8 | bucketg2JacExtendedC12 | diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index 83b43d9a33..55da8d2db1 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -454,25 +454,27 @@ type selector struct { } // return number of chunks for a given window size c +// the last chunk may be bigger to accomodate a potential carry from the NAF decomposition func computeNbChunks(c uint64) uint64 { - // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF - // decomposition in partitionScalars - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } - return nbChunks + return (fr.Bits + c - 1) / c } // return the last window size for a scalar; if c divides the scalar size // then it returns c // if not, returns lastC << c func lastC(c uint64) uint64 { - const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition - if n%c == 0 { - return c + nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits + if nbAvailableBits == 0 { + // we can push a bit the edge case here; + // if the c-msb bits of modulus are not all ones, we have space for the carry + // (assuming inputs are smaller than modulus) + const qMsb16 = 0b1111101110101100 + msbC := qMsb16 >> (16 - c) + if !(msbC&((1<= max { + if digit > max { digit -= (1 << c) carry = 1 } @@ -573,6 +575,20 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } digits[int(chunk)*len(scalars)+i] = bits } + + // for the last chunk, we don't want to borrow from a next window + // (but may have a larger max value) + chunk := nbChunks - 1 + s := selectors[chunk] + // init with carry if any + digit := carry + // digit = value of the 
c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } + digits[int(chunk)*len(scalars)+i] = uint16(digit) << 1 } }, nbTasks) diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index a0d2a19620..ba723c154a 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -458,25 +458,27 @@ type selector struct { } // return number of chunks for a given window size c +// the last chunk may be bigger to accomodate a potential carry from the NAF decomposition func computeNbChunks(c uint64) uint64 { - // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF - // decomposition in partitionScalars - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } - return nbChunks + return (fr.Bits + c - 1) / c } // return the last window size for a scalar; if c divides the scalar size // then it returns c // if not, returns lastC << c func lastC(c uint64) uint64 { - const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition - if n%c == 0 { - return c + nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits + if nbAvailableBits == 0 { + // we can push a bit the edge case here; + // if the c-msb bits of modulus are not all ones, we have space for the carry + // (assuming inputs are smaller than modulus) + const qMsb16 = 0b1101011100011101 + msbC := qMsb16 >> (16 - c) + if !(msbC&((1<= max { + if digit > max { digit -= (1 << c) carry = 1 } @@ -577,6 +579,20 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } digits[int(chunk)*len(scalars)+i] = bits } + + // for the last chunk, we don't want to borrow from a next window + // (but may have a larger max value) + chunk := nbChunks - 1 + s := selectors[chunk] + // init with carry if any + digit := carry + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } + digits[int(chunk)*len(scalars)+i] = uint16(digit) << 1 } }, nbTasks) diff --git a/internal/field/field.go b/internal/field/field.go index 714e09770b..34783417bf 100644 --- a/internal/field/field.go +++ b/internal/field/field.go @@ -18,13 +18,14 @@ package field import ( "errors" "fmt" - "github.com/consensys/bavard" - "github.com/consensys/gnark-crypto/internal/field/internal/addchain" "math" "math/big" "math/bits" "strconv" "strings" + + "github.com/consensys/bavard" + "github.com/consensys/gnark-crypto/internal/field/internal/addchain" ) var ( @@ -38,6 +39,7 @@ type FieldConfig struct { ModulusBig *big.Int Modulus string ModulusHex string + ModulusSixteenMSB uint64 // 16 most significant bits of the modulus, right-aligned. 
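// Why the generator keeps these 16 bits: lastC uses them to decide whether the
// most-significant window of the modulus leaves head-room for the NAF carry.
// A minimal sketch of that check (hypothetical names, not part of the
// generated code; assumes c <= 16 and scalars reduced mod q):
//
//	msbC := modulusSixteenMSB >> (16 - c)  // top c bits of q
//	hasRoom := msbC&((1<<c)-1) != (1<<c)-1 // not all ones => a carry fits
//
// If hasRoom, the last window can stay c bits wide even when the chunk count
// leaves no spare bit. (PATCH 41 below drops this edge case again.)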
NbWords int NbBits int NbWordsLastIndex int @@ -96,6 +98,18 @@ func NewFieldConfig(packageName, elementName, modulus string, useAddChain bool) F.NbBits = bModulus.BitLen() F.NbWords = len(bModulus.Bits()) + // compute the 16 msb; + if F.NbBits <= 16 { + F.ModulusSixteenMSB = F.ModulusBig.Uint64() + } else { + msb := new(big.Int) + msb.Rsh(F.ModulusBig, uint(F.NbBits)-16) + if msb.BitLen() != 16 { + panic("sanity check.") + } + F.ModulusSixteenMSB = msb.Uint64() + } + F.NbWordsLastIndex = F.NbWords - 1 // set q from big int repr diff --git a/internal/generator/ecc/generate.go b/internal/generator/ecc/generate.go index 2dce39800b..8283f912fd 100644 --- a/internal/generator/ecc/generate.go +++ b/internal/generator/ecc/generate.go @@ -28,12 +28,23 @@ func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) er funcs["last"] = func(x int, a interface{}) bool { return x == reflect.ValueOf(a).Len()-1 } + funcs["binary"] = func(x uint64) string { + return strings.TrimSpace(fmt.Sprintf("%b", x)) + } lastC := func(c int) int { - n := (conf.Fr.NbBits + 1) // +1 for the potential carry of the NAF decomposition - if n%c == 0 { - return c + nbChunks := (conf.Fr.NbBits + c - 1) / c + nbAvailableBits := (nbChunks * c) - conf.Fr.NbBits + if nbAvailableBits == 0 { + // we can push a bit the edge case here; + // if the c-msb bits of modulus are not all ones, we have space for the carry + // (assuming inputs are smaller than modulus) + msb16 := conf.Fr.ModulusSixteenMSB + msbC := msb16 >> (16 - c) + if !(msbC&((1< +1 for a potential carry propagation due to the NAF - // decomposition in partitionScalars - nbChunks := (fr.Bits+1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } - return nbChunks + return (fr.Bits+c-1) / c } // return the last window size for a scalar; if c divides the scalar size // then it returns c // if not, returns lastC << c func lastC(c uint64) uint64 { - const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition - if n%c == 0 { - return c + nbAvailableBits := (computeNbChunks(c)*c) - fr.Bits + if nbAvailableBits == 0 { + // we can push a bit the edge case here; + // if the c-msb bits of modulus are not all ones, we have space for the carry + // (assuming inputs are smaller than modulus) + const qMsb16 = 0b{{binary .Fr.ModulusSixteenMSB}} + msbC := qMsb16 >> (16 - c) + if !(msbC&((1<= max { + if digit > max { digit -= (1 << c) carry = 1 } @@ -157,6 +159,20 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } digits[int(chunk)*len(scalars)+i] = bits } + + // for the last chunk, we don't want to borrow from a next window + // (but may have a larger max value) + chunk := nbChunks - 1 + s := selectors[chunk] + // init with carry if any + digit := carry + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1] & s.maskHigh) << s.shiftHigh + } + digits[int(chunk)*len(scalars)+i] = uint16(digit) << 1 } }, nbTasks) From e3b29f7537f89c7369b5848adbf41010672f7f85 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Thu, 17 Nov 2022 13:26:29 -0600 Subject: [PATCH 41/43] fix: add panic in generator when c > 16 --- ecc/bls12-377/multiexp.go | 18 ++++--------- ecc/bls12-378/multiexp.go | 18 ++++--------- ecc/bls12-381/multiexp.go | 18 ++++--------- ecc/bls24-315/multiexp.go | 18 ++++--------- ecc/bls24-317/multiexp.go | 18 ++++--------- ecc/bn254/multiexp.go | 18 ++++--------- ecc/bw6-633/multiexp.go | 26 
++++++++----------- ecc/bw6-633/multiexp_affine.go | 2 ++ ecc/bw6-633/multiexp_jacobian.go | 4 +++ ecc/bw6-633/multiexp_test.go | 4 +-- ecc/bw6-756/multiexp.go | 18 ++++--------- ecc/bw6-761/multiexp.go | 18 ++++--------- internal/field/field.go | 13 ---------- internal/generator/ecc/generate.go | 23 +++++++--------- .../generator/ecc/template/multiexp.go.tmpl | 18 ++++--------- 15 files changed, 74 insertions(+), 160 deletions(-) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index c81bd397ad..7e05079bb4 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -561,21 +561,13 @@ func computeNbChunks(c uint64) uint64 { return (fr.Bits + c - 1) / c } -// return the last window size for a scalar; if c divides the scalar size -// then it returns c -// if not, returns lastC << c +// return the last window size for a scalar; +// this last window should accomodate a carry (from the NAF decomposition) +// it can be == c if we have 1 available bit +// it can be > c if we have 0 available bit +// it can be < c if we have 2+ available bits func lastC(c uint64) uint64 { nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits - if nbAvailableBits == 0 { - // we can push a bit the edge case here; - // if the c-msb bits of modulus are not all ones, we have space for the carry - // (assuming inputs are smaller than modulus) - const qMsb16 = 0b1001010101011011 - msbC := qMsb16 >> (16 - c) - if !(msbC&((1< c if we have 0 available bit +// it can be < c if we have 2+ available bits func lastC(c uint64) uint64 { nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits - if nbAvailableBits == 0 { - // we can push a bit the edge case here; - // if the c-msb bits of modulus are not all ones, we have space for the carry - // (assuming inputs are smaller than modulus) - const qMsb16 = 0b1000001110011110 - msbC := qMsb16 >> (16 - c) - if !(msbC&((1< c if we have 0 available bit +// it can be < c if we have 2+ available bits func lastC(c uint64) uint64 { nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits - if nbAvailableBits == 0 { - // we can push a bit the edge case here; - // if the c-msb bits of modulus are not all ones, we have space for the carry - // (assuming inputs are smaller than modulus) - const qMsb16 = 0b1110011111011011 - msbC := qMsb16 >> (16 - c) - if !(msbC&((1< c if we have 0 available bit +// it can be < c if we have 2+ available bits func lastC(c uint64) uint64 { nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits - if nbAvailableBits == 0 { - // we can push a bit the edge case here; - // if the c-msb bits of modulus are not all ones, we have space for the carry - // (assuming inputs are smaller than modulus) - const qMsb16 = 0b1100101101101111 - msbC := qMsb16 >> (16 - c) - if !(msbC&((1< c if we have 0 available bit +// it can be < c if we have 2+ available bits func lastC(c uint64) uint64 { nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits - if nbAvailableBits == 0 { - // we can push a bit the edge case here; - // if the c-msb bits of modulus are not all ones, we have space for the carry - // (assuming inputs are smaller than modulus) - const qMsb16 = 0b1000100001111111 - msbC := qMsb16 >> (16 - c) - if !(msbC&((1< c if we have 0 available bit +// it can be < c if we have 2+ available bits func lastC(c uint64) uint64 { nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits - if nbAvailableBits == 0 { - // we can push a bit the edge case here; - // if the c-msb bits of modulus are not all ones, we have space for the carry - // (assuming inputs are 
smaller than modulus) - const qMsb16 = 0b1100000110010001 - msbC := qMsb16 >> (16 - c) - if !(msbC&((1<= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 12, 16} + implementedCs := []uint64{4, 5, 6, 8, 12, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -187,6 +187,8 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: return processChunkG1Jacobian[bucketg1JacExtendedC5] + case 6: + return processChunkG1Jacobian[bucketg1JacExtendedC6] case 8: return processChunkG1Jacobian[bucketg1JacExtendedC8] case 12: @@ -290,7 +292,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 12, 16} + implementedCs := []uint64{4, 5, 6, 8, 12, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -393,6 +395,8 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: return processChunkG2Jacobian[bucketg2JacExtendedC5] + case 6: + return processChunkG2Jacobian[bucketg2JacExtendedC6] case 8: return processChunkG2Jacobian[bucketg2JacExtendedC8] case 12: @@ -455,21 +459,13 @@ func computeNbChunks(c uint64) uint64 { return (fr.Bits + c - 1) / c } -// return the last window size for a scalar; if c divides the scalar size -// then it returns c -// if not, returns lastC << c +// return the last window size for a scalar; +// this last window should accomodate a carry (from the NAF decomposition) +// it can be == c if we have 1 available bit +// it can be > c if we have 0 available bit +// it can be < c if we have 2+ available bits func lastC(c uint64) uint64 { nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits - if nbAvailableBits == 0 { - // we can push a bit the edge case here; - // if the c-msb bits of modulus are not all ones, we have space for the carry - // (assuming inputs are smaller than modulus) - const qMsb16 = 0b1001100001000111 - msbC := qMsb16 >> (16 - c) - if !(msbC&((1< c if we have 0 available bit +// it can be < c if we have 2+ available bits func lastC(c uint64) uint64 { nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits - if nbAvailableBits == 0 { - // we can push a bit the edge case here; - // if the c-msb bits of modulus are not all ones, we have space for the carry - // (assuming inputs are smaller than modulus) - const qMsb16 = 0b1111101110101100 - msbC := qMsb16 >> (16 - c) - if !(msbC&((1< c if we have 0 available bit +// it can be < c if we have 2+ available bits func lastC(c uint64) uint64 { nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits - if nbAvailableBits == 0 { - // we can push a bit the edge case here; - // if the c-msb bits of modulus are not all ones, we have space for the carry - // (assuming inputs are smaller than modulus) - const qMsb16 = 0b1101011100011101 - msbC := qMsb16 >> (16 - c) - if !(msbC&((1< c if we have 0 available bit + // it can be < c if we have 2+ available bits lastC := func(c int) int { nbChunks := (conf.Fr.NbBits + c - 1) / c nbAvailableBits := (nbChunks * c) - conf.Fr.NbBits - if nbAvailableBits == 0 { - // we can push a bit the edge case here; - // if the 
c-msb bits of modulus are not all ones, we have space for the carry - // (assuming inputs are smaller than modulus) - msb16 := conf.Fr.ModulusSixteenMSB - msbC := msb16 >> (16 - c) - if !(msbC&((1< 16 { + panic("we have a problem since we are using uint16 to store digits") } - return c + 1 - nbAvailableBits + return lc } batchSize := func(c int) int { // nbBuckets := (1 << (c - 1)) diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index 19ee187a84..2f892e0c99 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -38,21 +38,13 @@ func computeNbChunks(c uint64) uint64 { return (fr.Bits+c-1) / c } -// return the last window size for a scalar; if c divides the scalar size -// then it returns c -// if not, returns lastC << c +// return the last window size for a scalar; +// this last window should accomodate a carry (from the NAF decomposition) +// it can be == c if we have 1 available bit +// it can be > c if we have 0 available bit +// it can be < c if we have 2+ available bits func lastC(c uint64) uint64 { nbAvailableBits := (computeNbChunks(c)*c) - fr.Bits - if nbAvailableBits == 0 { - // we can push a bit the edge case here; - // if the c-msb bits of modulus are not all ones, we have space for the carry - // (assuming inputs are smaller than modulus) - const qMsb16 = 0b{{binary .Fr.ModulusSixteenMSB}} - msbC := qMsb16 >> (16 - c) - if !(msbC&((1< Date: Mon, 21 Nov 2022 18:22:29 +0100 Subject: [PATCH 42/43] perf: remove 3 muls by 1 in batchAdd --- ecc/bls12-377/g1.go | 8 +++++--- ecc/bls12-377/g2.go | 8 +++++--- ecc/bls12-378/g1.go | 8 +++++--- ecc/bls12-378/g2.go | 8 +++++--- ecc/bls12-381/g1.go | 8 +++++--- ecc/bls12-381/g2.go | 8 +++++--- ecc/bls24-315/g1.go | 8 +++++--- ecc/bls24-315/g2.go | 8 +++++--- ecc/bls24-317/g1.go | 8 +++++--- ecc/bls24-317/g2.go | 8 +++++--- ecc/bn254/g1.go | 8 +++++--- ecc/bn254/g2.go | 8 +++++--- ecc/bw6-633/g1.go | 8 +++++--- ecc/bw6-633/g2.go | 8 +++++--- ecc/bw6-756/g1.go | 8 +++++--- ecc/bw6-756/g2.go | 8 +++++--- ecc/bw6-761/g1.go | 8 +++++--- ecc/bw6-761/g2.go | 8 +++++--- internal/generator/ecc/template/point.go.tmpl | 15 ++++++++------- 19 files changed, 98 insertions(+), 61 deletions(-) diff --git a/ecc/bls12-377/g1.go b/ecc/bls12-377/g1.go index 910dd07b5e..5a6f659d36 100644 --- a/ecc/bls12-377/g1.go +++ b/ecc/bls12-377/g1.go @@ -994,19 +994,21 @@ func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fp.Element - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fp.Element diff --git a/ecc/bls12-377/g2.go b/ecc/bls12-377/g2.go index 0fe9a4119c..011e336c8b 100644 --- a/ecc/bls12-377/g2.go +++ b/ecc/bls12-377/g2.go @@ -990,19 +990,21 @@ func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fptower.E2 - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 
1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fptower.E2 diff --git a/ecc/bls12-378/g1.go b/ecc/bls12-378/g1.go index 67d64790d7..a409696a97 100644 --- a/ecc/bls12-378/g1.go +++ b/ecc/bls12-378/g1.go @@ -994,19 +994,21 @@ func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fp.Element - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fp.Element diff --git a/ecc/bls12-378/g2.go b/ecc/bls12-378/g2.go index 905e2ba893..eb26579bba 100644 --- a/ecc/bls12-378/g2.go +++ b/ecc/bls12-378/g2.go @@ -990,19 +990,21 @@ func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fptower.E2 - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fptower.E2 diff --git a/ecc/bls12-381/g1.go b/ecc/bls12-381/g1.go index 474c868025..37c71feaae 100644 --- a/ecc/bls12-381/g1.go +++ b/ecc/bls12-381/g1.go @@ -994,19 +994,21 @@ func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fp.Element - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fp.Element diff --git a/ecc/bls12-381/g2.go b/ecc/bls12-381/g2.go index a8575f59f7..d65c640328 100644 --- a/ecc/bls12-381/g2.go +++ b/ecc/bls12-381/g2.go @@ -991,19 +991,21 @@ func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fptower.E2 - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fptower.E2 diff 
--git a/ecc/bls24-315/g1.go b/ecc/bls24-315/g1.go index bde8a50d43..cb66154c0f 100644 --- a/ecc/bls24-315/g1.go +++ b/ecc/bls24-315/g1.go @@ -996,19 +996,21 @@ func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fp.Element - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fp.Element diff --git a/ecc/bls24-315/g2.go b/ecc/bls24-315/g2.go index 662bfe0313..8ec580405e 100644 --- a/ecc/bls24-315/g2.go +++ b/ecc/bls24-315/g2.go @@ -1006,19 +1006,21 @@ func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fptower.E4 - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fptower.E4 diff --git a/ecc/bls24-317/g1.go b/ecc/bls24-317/g1.go index cd9452b1ce..50fdef55c9 100644 --- a/ecc/bls24-317/g1.go +++ b/ecc/bls24-317/g1.go @@ -996,19 +996,21 @@ func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fp.Element - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fp.Element diff --git a/ecc/bls24-317/g2.go b/ecc/bls24-317/g2.go index 96d823eaf9..72a3631d39 100644 --- a/ecc/bls24-317/g2.go +++ b/ecc/bls24-317/g2.go @@ -1006,19 +1006,21 @@ func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fptower.E4 - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fptower.E4 diff --git a/ecc/bn254/g1.go b/ecc/bn254/g1.go index 5bad4b316c..fe0af18997 100644 --- a/ecc/bn254/g1.go +++ b/ecc/bn254/g1.go @@ -966,19 +966,21 @@ func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fp.Element - accumulator.SetOne() + 
lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fp.Element diff --git a/ecc/bn254/g2.go b/ecc/bn254/g2.go index 09011b0c53..c4f6caaf9f 100644 --- a/ecc/bn254/g2.go +++ b/ecc/bn254/g2.go @@ -995,19 +995,21 @@ func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fptower.E2 - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fptower.E2 diff --git a/ecc/bw6-633/g1.go b/ecc/bw6-633/g1.go index dc2289ac76..ed35639923 100644 --- a/ecc/bw6-633/g1.go +++ b/ecc/bw6-633/g1.go @@ -1098,19 +1098,21 @@ func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fp.Element - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fp.Element diff --git a/ecc/bw6-633/g2.go b/ecc/bw6-633/g2.go index 3d27026424..a045759de4 100644 --- a/ecc/bw6-633/g2.go +++ b/ecc/bw6-633/g2.go @@ -961,19 +961,21 @@ func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fp.Element - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fp.Element diff --git a/ecc/bw6-756/g1.go b/ecc/bw6-756/g1.go index 5cbf001665..d69651047a 100644 --- a/ecc/bw6-756/g1.go +++ b/ecc/bw6-756/g1.go @@ -1098,19 +1098,21 @@ func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fp.Element - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, 
&lambdain[i]) } + lambda[0].Set(&accumulator) } var d fp.Element diff --git a/ecc/bw6-756/g2.go b/ecc/bw6-756/g2.go index e8b048fb9b..822d56145a 100644 --- a/ecc/bw6-756/g2.go +++ b/ecc/bw6-756/g2.go @@ -955,19 +955,21 @@ func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fp.Element - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fp.Element diff --git a/ecc/bw6-761/g1.go b/ecc/bw6-761/g1.go index d6de060519..9fee8ba8de 100644 --- a/ecc/bw6-761/g1.go +++ b/ecc/bw6-761/g1.go @@ -1109,19 +1109,21 @@ func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fp.Element - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fp.Element diff --git a/ecc/bw6-761/g2.go b/ecc/bw6-761/g2.go index b1b8b664dd..174b33829a 100644 --- a/ecc/bw6-761/g2.go +++ b/ecc/bw6-761/g2.go @@ -969,19 +969,21 @@ func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fp.Element - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fp.Element diff --git a/internal/generator/ecc/template/point.go.tmpl b/internal/generator/ecc/template/point.go.tmpl index c5455ff072..57e40eb2c4 100644 --- a/internal/generator/ecc/template/point.go.tmpl +++ b/internal/generator/ecc/template/point.go.tmpl @@ -1586,19 +1586,21 @@ func batchAdd{{ $TAffine }}[TP p{{ $TAffine }}, TPP pp{{ $TAffine }}, TC c{{ $TA // invert denominator using montgomery batch invert technique { var accumulator {{.CoordType}} - accumulator.SetOne() - - for i := 0; i < batchSize; i++ { + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) + + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } - + accumulator.Inverse(&accumulator) - - for i := batchSize - 1; i >= 0; i-- { + + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d {{.CoordType}} @@ -1620,4 +1622,3 @@ func batchAdd{{ $TAffine }}[TP p{{ $TAffine }}, TPP pp{{ $TAffine }}, TC c{{ $TA (*R)[j].Set(&rr) } } - From 9673409560e5db16885180a462951a74e12a9335 Mon Sep 
17 00:00:00 2001 From: Youssef El Housni Date: Mon, 21 Nov 2022 19:57:02 +0100 Subject: [PATCH 43/43] docs: add comment regarding double(infinity) in ext-Jac --- ecc/bls12-377/g1.go | 2 + ecc/bls12-377/g2.go | 2 + ecc/bls12-377/multiexp.go | 4 -- ecc/bls12-378/g1.go | 2 + ecc/bls12-378/g2.go | 2 + ecc/bls12-378/multiexp.go | 4 -- ecc/bls12-381/g1.go | 2 + ecc/bls12-381/g2.go | 2 + ecc/bls12-381/multiexp.go | 4 -- ecc/bls24-315/g1.go | 2 + ecc/bls24-315/g2.go | 2 + ecc/bls24-315/multiexp.go | 4 -- ecc/bls24-317/g1.go | 2 + ecc/bls24-317/g2.go | 2 + ecc/bls24-317/multiexp.go | 4 -- ecc/bn254/g1.go | 2 + ecc/bn254/g2.go | 2 + ecc/bn254/multiexp.go | 4 -- ecc/bw6-633/g1.go | 2 + ecc/bw6-633/g2.go | 2 + ecc/bw6-633/multiexp.go | 4 -- ecc/bw6-756/g1.go | 2 + ecc/bw6-756/g2.go | 2 + ecc/bw6-756/multiexp.go | 4 -- ecc/bw6-761/g1.go | 2 + ecc/bw6-761/g2.go | 2 + ecc/bw6-761/multiexp.go | 4 -- .../generator/ecc/template/multiexp.go.tmpl | 56 +++++++++---------- internal/generator/ecc/template/point.go.tmpl | 2 + 29 files changed, 65 insertions(+), 65 deletions(-) diff --git a/ecc/bls12-377/g1.go b/ecc/bls12-377/g1.go index 5a6f659d36..d3c6e729da 100644 --- a/ecc/bls12-377/g1.go +++ b/ecc/bls12-377/g1.go @@ -635,6 +635,8 @@ func (p *g1JacExtended) add(q *g1JacExtended) *g1JacExtended { // double point in Jacobian extended coordinates // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 +// since we consider any point on Z=0 as the point at infinity +// this doubling formula works for infinity points as well func (p *g1JacExtended) double(q *g1JacExtended) *g1JacExtended { var U, V, W, S, XX, M fp.Element diff --git a/ecc/bls12-377/g2.go b/ecc/bls12-377/g2.go index 011e336c8b..51660e48a6 100644 --- a/ecc/bls12-377/g2.go +++ b/ecc/bls12-377/g2.go @@ -653,6 +653,8 @@ func (p *g2JacExtended) add(q *g2JacExtended) *g2JacExtended { // double point in Jacobian extended coordinates // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 +// since we consider any point on Z=0 as the point at infinity +// this doubling formula works for infinity points as well func (p *g2JacExtended) double(q *g2JacExtended) *g2JacExtended { var U, V, W, S, XX, M fptower.E2 diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index 7e05079bb4..13c7dd331e 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -273,14 +273,12 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } @@ -532,14 +530,12 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? 
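// The TODO that used to sit here ("what if _p is infinity here?") is settled
// by the doc comment added to double above: the point at infinity is any
// point with ZZ == 0, and in the dbl-2008-s-1 formulas the output coordinates
// are ZZ3 = V*ZZ and ZZZ3 = W*ZZZ, so an infinity input is mapped back to
// infinity and the doubling loop needs no special case.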
return p.unsafeFromJacExtended(&_p) } diff --git a/ecc/bls12-378/g1.go b/ecc/bls12-378/g1.go index a409696a97..eda699043c 100644 --- a/ecc/bls12-378/g1.go +++ b/ecc/bls12-378/g1.go @@ -635,6 +635,8 @@ func (p *g1JacExtended) add(q *g1JacExtended) *g1JacExtended { // double point in Jacobian extended coordinates // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 +// since we consider any point on Z=0 as the point at infinity +// this doubling formula works for infinity points as well func (p *g1JacExtended) double(q *g1JacExtended) *g1JacExtended { var U, V, W, S, XX, M fp.Element diff --git a/ecc/bls12-378/g2.go b/ecc/bls12-378/g2.go index eb26579bba..1c5c7a17b5 100644 --- a/ecc/bls12-378/g2.go +++ b/ecc/bls12-378/g2.go @@ -653,6 +653,8 @@ func (p *g2JacExtended) add(q *g2JacExtended) *g2JacExtended { // double point in Jacobian extended coordinates // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 +// since we consider any point on Z=0 as the point at infinity +// this doubling formula works for infinity points as well func (p *g2JacExtended) double(q *g2JacExtended) *g2JacExtended { var U, V, W, S, XX, M fptower.E2 diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 9e1685cca8..a52821b9f9 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -275,14 +275,12 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } @@ -536,14 +534,12 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? 
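// Aside: a self-contained sketch of the Montgomery batch-inversion pattern
// reworked in PATCH 42 above, with the two redundant multiplications by one
// on lambda[0] removed. math/big stands in for fp.Element; hypothetical
// helper, assuming q prime, len(a) > 0 and every a[i] invertible mod q.
func batchInvertSketch(a []*big.Int, q *big.Int) []*big.Int {
	lambda := make([]*big.Int, len(a))
	acc := new(big.Int).Set(a[0]) // running product a[0]*...*a[i]
	lambda[0] = new(big.Int)      // filled at the very end
	for i := 1; i < len(a); i++ {
		lambda[i] = new(big.Int).Set(acc) // = a[0]*...*a[i-1]
		acc.Mul(acc, a[i]).Mod(acc, q)
	}
	acc.ModInverse(acc, q) // the single inversion, amortized over the batch
	for i := len(a) - 1; i > 0; i-- {
		lambda[i].Mul(lambda[i], acc).Mod(lambda[i], q) // = a[i]^-1
		acc.Mul(acc, a[i]).Mod(acc, q)                  // strip a[i] from the inverse
	}
	lambda[0].Set(acc) // once everything else is stripped, acc = a[0]^-1
	return lambda
}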
return p.unsafeFromJacExtended(&_p) } diff --git a/ecc/bls12-381/g1.go b/ecc/bls12-381/g1.go index 37c71feaae..0c71c6b17a 100644 --- a/ecc/bls12-381/g1.go +++ b/ecc/bls12-381/g1.go @@ -635,6 +635,8 @@ func (p *g1JacExtended) add(q *g1JacExtended) *g1JacExtended { // double point in Jacobian extended coordinates // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 +// since we consider any point on Z=0 as the point at infinity +// this doubling formula works for infinity points as well func (p *g1JacExtended) double(q *g1JacExtended) *g1JacExtended { var U, V, W, S, XX, M fp.Element diff --git a/ecc/bls12-381/g2.go b/ecc/bls12-381/g2.go index d65c640328..e999ff48af 100644 --- a/ecc/bls12-381/g2.go +++ b/ecc/bls12-381/g2.go @@ -654,6 +654,8 @@ func (p *g2JacExtended) add(q *g2JacExtended) *g2JacExtended { // double point in Jacobian extended coordinates // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 +// since we consider any point on Z=0 as the point at infinity +// this doubling formula works for infinity points as well func (p *g2JacExtended) double(q *g2JacExtended) *g2JacExtended { var U, V, W, S, XX, M fptower.E2 diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index cdf5680ca4..2d83a2eca2 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -273,14 +273,12 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } @@ -532,14 +530,12 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? 
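// The reduction loop above is Horner's rule in the exponent: with window
// size c and per-chunk sums t_j, the result is sum_j t_j * 2^{c*j},
// accumulated as ((t_{k-1} * 2^c + t_{k-2}) * 2^c + ...), where each
// multiplication by 2^c is the run of c doublings. A scalar analogue
// (hypothetical name, uint64 standing in for the group, chunks non-empty):
func reduceChunksSketch(chunks []uint64, c uint) uint64 {
	res := chunks[len(chunks)-1]
	for j := len(chunks) - 2; j >= 0; j-- {
		res <<= c // the c doublings of _p
		res += chunks[j]
	}
	return res
}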
return p.unsafeFromJacExtended(&_p) } diff --git a/ecc/bls24-315/g1.go b/ecc/bls24-315/g1.go index cb66154c0f..7a47da919c 100644 --- a/ecc/bls24-315/g1.go +++ b/ecc/bls24-315/g1.go @@ -637,6 +637,8 @@ func (p *g1JacExtended) add(q *g1JacExtended) *g1JacExtended { // double point in Jacobian extended coordinates // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 +// since we consider any point on Z=0 as the point at infinity +// this doubling formula works for infinity points as well func (p *g1JacExtended) double(q *g1JacExtended) *g1JacExtended { var U, V, W, S, XX, M fp.Element diff --git a/ecc/bls24-315/g2.go b/ecc/bls24-315/g2.go index 8ec580405e..06d38bfc1d 100644 --- a/ecc/bls24-315/g2.go +++ b/ecc/bls24-315/g2.go @@ -669,6 +669,8 @@ func (p *g2JacExtended) add(q *g2JacExtended) *g2JacExtended { // double point in Jacobian extended coordinates // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 +// since we consider any point on Z=0 as the point at infinity +// this doubling formula works for infinity points as well func (p *g2JacExtended) double(q *g2JacExtended) *g2JacExtended { var U, V, W, S, XX, M fptower.E4 diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 4d546776e5..2b440ca494 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -273,14 +273,12 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } @@ -532,14 +530,12 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? 
return p.unsafeFromJacExtended(&_p) } diff --git a/ecc/bls24-317/g1.go b/ecc/bls24-317/g1.go index 50fdef55c9..3531d495cb 100644 --- a/ecc/bls24-317/g1.go +++ b/ecc/bls24-317/g1.go @@ -637,6 +637,8 @@ func (p *g1JacExtended) add(q *g1JacExtended) *g1JacExtended { // double point in Jacobian extended coordinates // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 +// since we consider any point on Z=0 as the point at infinity +// this doubling formula works for infinity points as well func (p *g1JacExtended) double(q *g1JacExtended) *g1JacExtended { var U, V, W, S, XX, M fp.Element diff --git a/ecc/bls24-317/g2.go b/ecc/bls24-317/g2.go index 72a3631d39..1bd9fa92c5 100644 --- a/ecc/bls24-317/g2.go +++ b/ecc/bls24-317/g2.go @@ -669,6 +669,8 @@ func (p *g2JacExtended) add(q *g2JacExtended) *g2JacExtended { // double point in Jacobian extended coordinates // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 +// since we consider any point on Z=0 as the point at infinity +// this doubling formula works for infinity points as well func (p *g2JacExtended) double(q *g2JacExtended) *g2JacExtended { var U, V, W, S, XX, M fptower.E4 diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index c92af97379..19d613fb55 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -273,14 +273,12 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } @@ -532,14 +530,12 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? 
return p.unsafeFromJacExtended(&_p) } diff --git a/ecc/bn254/g1.go b/ecc/bn254/g1.go index fe0af18997..2682d9c6fe 100644 --- a/ecc/bn254/g1.go +++ b/ecc/bn254/g1.go @@ -607,6 +607,8 @@ func (p *g1JacExtended) add(q *g1JacExtended) *g1JacExtended { // double point in Jacobian extended coordinates // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 +// since we consider any point on Z=0 as the point at infinity +// this doubling formula works for infinity points as well func (p *g1JacExtended) double(q *g1JacExtended) *g1JacExtended { var U, V, W, S, XX, M fp.Element diff --git a/ecc/bn254/g2.go b/ecc/bn254/g2.go index c4f6caaf9f..8026f982db 100644 --- a/ecc/bn254/g2.go +++ b/ecc/bn254/g2.go @@ -658,6 +658,8 @@ func (p *g2JacExtended) add(q *g2JacExtended) *g2JacExtended { // double point in Jacobian extended coordinates // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 +// since we consider any point on Z=0 as the point at infinity +// this doubling formula works for infinity points as well func (p *g2JacExtended) double(q *g2JacExtended) *g2JacExtended { var U, V, W, S, XX, M fptower.E2 diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index e7f8fc56bd..b70a8c4641 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -275,14 +275,12 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } @@ -536,14 +534,12 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? 
return p.unsafeFromJacExtended(&_p) } diff --git a/ecc/bw6-633/g1.go b/ecc/bw6-633/g1.go index ed35639923..78eeb6e8f8 100644 --- a/ecc/bw6-633/g1.go +++ b/ecc/bw6-633/g1.go @@ -664,6 +664,8 @@ func (p *g1JacExtended) add(q *g1JacExtended) *g1JacExtended { // double point in Jacobian extended coordinates // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 +// since we consider any point on Z=0 as the point at infinity +// this doubling formula works for infinity points as well func (p *g1JacExtended) double(q *g1JacExtended) *g1JacExtended { var U, V, W, S, XX, M fp.Element diff --git a/ecc/bw6-633/g2.go b/ecc/bw6-633/g2.go index a045759de4..a927f9f8a7 100644 --- a/ecc/bw6-633/g2.go +++ b/ecc/bw6-633/g2.go @@ -654,6 +654,8 @@ func (p *g2JacExtended) add(q *g2JacExtended) *g2JacExtended { // double point in Jacobian extended coordinates // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 +// since we consider any point on Z=0 as the point at infinity +// this doubling formula works for infinity points as well func (p *g2JacExtended) double(q *g2JacExtended) *g2JacExtended { var U, V, W, S, XX, M fp.Element diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index 6dc4bbc779..c58ed424ab 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -222,14 +222,12 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } @@ -430,14 +428,12 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? 
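// Aside on how the window size is picked (the bestC closure earlier in this
// file, cf. PATCH 41): the approximate cost model is bits/c * (nbPoints + 2^c)
// group operations, minimized over the implemented window sizes. A toy
// version of that selection (hypothetical name; needs `import "math"` and
// ignores the recursive splitting until nbChunks(c) >= nbTasks):
func bestCSketch(nbPoints, bits int, implementedCs []int) int {
	best := implementedCs[0]
	bestCost := math.Inf(1)
	for _, c := range implementedCs {
		cost := float64(bits) / float64(c) * float64(nbPoints+(1<<c))
		if cost < bestCost {
			best, bestCost = c, cost
		}
	}
	return best
}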
 	return p.unsafeFromJacExtended(&_p)
 }
diff --git a/ecc/bw6-756/g1.go b/ecc/bw6-756/g1.go
index d69651047a..9d54281e86 100644
--- a/ecc/bw6-756/g1.go
+++ b/ecc/bw6-756/g1.go
@@ -664,6 +664,8 @@ func (p *g1JacExtended) add(q *g1JacExtended) *g1JacExtended {
 
 // double point in Jacobian extended coordinates
 // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1
+// since we consider any point on Z=0 as the point at infinity
+// this doubling formula works for infinity points as well
 func (p *g1JacExtended) double(q *g1JacExtended) *g1JacExtended {
 	var U, V, W, S, XX, M fp.Element
 
diff --git a/ecc/bw6-756/g2.go b/ecc/bw6-756/g2.go
index 822d56145a..fe6189779b 100644
--- a/ecc/bw6-756/g2.go
+++ b/ecc/bw6-756/g2.go
@@ -648,6 +648,8 @@ func (p *g2JacExtended) add(q *g2JacExtended) *g2JacExtended {
 
 // double point in Jacobian extended coordinates
 // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1
+// since we consider any point on Z=0 as the point at infinity
+// this doubling formula works for infinity points as well
 func (p *g2JacExtended) double(q *g2JacExtended) *g2JacExtended {
 	var U, V, W, S, XX, M fp.Element
 
diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go
index edcb56f599..d2ac51b7ca 100644
--- a/ecc/bw6-756/multiexp.go
+++ b/ecc/bw6-756/multiexp.go
@@ -222,14 +222,12 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J
 	_p.Set(&totalj)
 	for j := len(chChunks) - 2; j >= 0; j-- {
 		for l := 0; l < c; l++ {
-			// TODO @gbotrel / @yelhousni --> what if _p is infinity here?
 			_p.double(&_p)
 		}
 		totalj := <-chChunks[j]
 		_p.add(&totalj)
 	}
 
-	// TODO @gbotrel / @yelhousni --> what if _p is infinity here?
 	return p.unsafeFromJacExtended(&_p)
 }
 
@@ -430,14 +428,12 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J
 	_p.Set(&totalj)
 	for j := len(chChunks) - 2; j >= 0; j-- {
 		for l := 0; l < c; l++ {
-			// TODO @gbotrel / @yelhousni --> what if _p is infinity here?
 			_p.double(&_p)
 		}
 		totalj := <-chChunks[j]
 		_p.add(&totalj)
 	}
 
-	// TODO @gbotrel / @yelhousni --> what if _p is infinity here?
 	return p.unsafeFromJacExtended(&_p)
 }
diff --git a/ecc/bw6-761/g1.go b/ecc/bw6-761/g1.go
index 9fee8ba8de..4f16b8f2a8 100644
--- a/ecc/bw6-761/g1.go
+++ b/ecc/bw6-761/g1.go
@@ -675,6 +675,8 @@ func (p *g1JacExtended) add(q *g1JacExtended) *g1JacExtended {
 
 // double point in Jacobian extended coordinates
 // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1
+// since we consider any point on Z=0 as the point at infinity
+// this doubling formula works for infinity points as well
 func (p *g1JacExtended) double(q *g1JacExtended) *g1JacExtended {
 	var U, V, W, S, XX, M fp.Element
 
diff --git a/ecc/bw6-761/g2.go b/ecc/bw6-761/g2.go
index 174b33829a..dc0fc1483e 100644
--- a/ecc/bw6-761/g2.go
+++ b/ecc/bw6-761/g2.go
@@ -662,6 +662,8 @@ func (p *g2JacExtended) add(q *g2JacExtended) *g2JacExtended {
 
 // double point in Jacobian extended coordinates
 // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1
+// since we consider any point on Z=0 as the point at infinity
+// this doubling formula works for infinity points as well
 func (p *g2JacExtended) double(q *g2JacExtended) *g2JacExtended {
 	var U, V, W, S, XX, M fp.Element
 
diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go
index 46e37958fa..c17eee4727 100644
--- a/ecc/bw6-761/multiexp.go
+++ b/ecc/bw6-761/multiexp.go
@@ -224,14 +224,12 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J
 	_p.Set(&totalj)
 	for j := len(chChunks) - 2; j >= 0; j-- {
 		for l := 0; l < c; l++ {
-			// TODO @gbotrel / @yelhousni --> what if _p is infinity here?
 			_p.double(&_p)
 		}
 		totalj := <-chChunks[j]
 		_p.add(&totalj)
 	}
 
-	// TODO @gbotrel / @yelhousni --> what if _p is infinity here?
 	return p.unsafeFromJacExtended(&_p)
 }
 
@@ -434,14 +432,12 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J
 	_p.Set(&totalj)
 	for j := len(chChunks) - 2; j >= 0; j-- {
 		for l := 0; l < c; l++ {
-			// TODO @gbotrel / @yelhousni --> what if _p is infinity here?
 			_p.double(&_p)
 		}
 		totalj := <-chChunks[j]
 		_p.add(&totalj)
 	}
 
-	// TODO @gbotrel / @yelhousni --> what if _p is infinity here?
 	return p.unsafeFromJacExtended(&_p)
 }
diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl
index 2f892e0c99..b6cc7aef5e 100644
--- a/internal/generator/ecc/template/multiexp.go.tmpl
+++ b/internal/generator/ecc/template/multiexp.go.tmpl
@@ -49,19 +49,19 @@ func lastC(c uint64) uint64 {
 }
 
 type chunkStat struct {
-	// relative weight of work compared to other chunks. 100.0 -> nominal weight.
+	// relative weight of work compared to other chunks. 100.0 -> nominal weight.
 	weight float32
 
-	// // average absolute deviation. this is meant to give a sense of statistical
+	// // average absolute deviation. this is meant to give a sense of statistical
 	// // dispersion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets)
-	// deviation int
+	// deviation int
 
-	// percentage of bucket filled in the window;
+	// percentage of bucket filled in the window;
 	ppBucketFilled float32
-	nbBucketFilled int
+	nbBucketFilled int
 
 	// // average ops per non-zero buckets
-	// averageOpsPerBucket int
+	// averageOpsPerBucket int
 
 }
 
@@ -133,15 +133,15 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks
 
 			// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
 			// 2^{c} from the current digit, making it negative.
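The borrow rule in this comment is implemented by the hunk just below; on a small scalar it is easy to trace by hand. A self-contained sketch (signedDigits is a hypothetical helper; the generated partitionScalars additionally re-encodes each signed digit into a bucket-index/sign bit pattern):

package main

import "fmt"

// signedDigits decomposes a small scalar into base-2^c digits, then remaps
// each digit into the signed range (-2^(c-1), 2^(c-1)] by borrowing 2^c
// from the next window whenever the digit exceeds 2^(c-1).
func signedDigits(scalar uint64, c uint, nbChunks int) []int64 {
	max := int64(1) << (c - 1)
	mask := uint64(1)<<c - 1
	digits := make([]int64, nbChunks)
	carry := int64(0)
	for chunk := 0; chunk < nbChunks; chunk++ {
		digit := carry + int64((scalar>>(uint(chunk)*c))&mask)
		carry = 0
		if digit > max { // borrow 2^c from the next window
			digit -= 1 << c
			carry = 1
		}
		digits[chunk] = digit
	}
	return digits
}

func main() {
	// 246 = 0b1111_0110 with c=4: raw digits are [6, 15];
	// 15 > 8, so it becomes -1 with a carry into the next window,
	// and indeed 6 + (-1)*16 + 1*256 = 246.
	fmt.Println(signedDigits(246, 4, 3)) // prints [6 -1 1]
}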
-			if digit > max {
+			if digit > max {
 				digit -= (1 << c)
 				carry = 1
 			}
 
 			// if digit is zero, no impact on result
 			if digit == 0 {
-				continue
-			}
+				continue
+			}
 
 			var bits uint16
 			if digit > 0 {
@@ -152,7 +152,7 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks
 		digits[int(chunk)*len(scalars)+i] = bits
 	}
 
-	// for the last chunk, we don't want to borrow from a next window
+	// for the last chunk, we don't want to borrow from a next window
 	// (but may have a larger max value)
 	chunk := nbChunks - 1
 	s := selectors[chunk]
@@ -168,7 +168,7 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks
 		}
 	}, nbTasks)
 
-
+
 	// aggregate chunk stats
 	chunkStats := make([]chunkStat, nbChunks)
 
@@ -179,9 +179,9 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks
 	parallel.Execute(len(chunkStats), func(start, end int) {
 		// for each chunk compute the statistics
 		for chunkID := start; chunkID < end; chunkID++ {
-			// indicates if a bucket is hit.
-			var b bitSetC16
-
+			// indicates if a bucket is hit.
+			var b bitSetC16
+
 			// digits for the chunk
 			chunkDigits := digits[chunkID*len(scalars):(chunkID+1)*len(scalars)]
 
@@ -189,7 +189,7 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks
 			nz := 0 // non zero buckets count
 			for _, digit := range chunkDigits {
 				if digit == 0 {
-					continue
+					continue
 				}
 				totalOps++
 				bucketID := digit >> 1
@@ -219,7 +219,7 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks
 			chunkStats[i].weight = (chunkStats[i].weight * 100.0) / target
 		}
 	}
-
+
 	return digits, chunkStats
 }
 
@@ -273,7 +273,7 @@ func partitionScalarsOld(scalars []fr.Element, c uint64, scalarsMont bool, nbTas
 	// processing in the msm in 2, to ensure all go routines finish at ~same time
 	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks goroutines
 	// if it does, though, this will deadlock.
-	chSmallValues := make(chan int, nbTasks)
+	chSmallValues := make(chan int, nbTasks)
 
 	parallel.Execute(len(scalars), func(start, end int) {
 		smallValues := 0
@@ -313,7 +313,7 @@ func partitionScalarsOld(scalars []fr.Element, c uint64, scalarsMont bool, nbTas
 
 			// if digit is zero, no impact on result
 			if digit == 0 {
-				continue
+				continue
 			}
 
@@ -342,8 +342,8 @@ func partitionScalarsOld(scalars []fr.Element, c uint64, scalarsMont bool, nbTas
 		chSmallValues <- smallValues
 
 	}, nbTasks)
-
-
+
+
 	// aggregate small values
 	close(chSmallValues)
 	smallValues := 0
@@ -357,19 +357,19 @@ func partitionScalarsOld(scalars []fr.Element, c uint64, scalarsMont bool, nbTas
 
 // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
-//
+//
 // This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
 func (p *{{ $.TAffine }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Element, config ecc.MultiExpConfig) (*{{ $.TAffine }}, error) {
 	var _p {{$.TJacobian}}
 	if _, err := _p.MultiExp(points, scalars, config); err != nil {
-		return nil, err
+		return nil, err
 	}
 	p.FromJacobian(&_p)
 	return p, nil
 }
 
 // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
-//
+//
 // This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
 func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Element, config ecc.MultiExpConfig) (*{{ $.TJacobian }}, error) {
 	// note:
@@ -446,7 +446,7 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem
 	nbTasksPostSplit := nbChunksPostSplit*2
 	if (nbTasksPostSplit <= config.NbTasks /2 ) || ( nbTasksPostSplit - config.NbTasks/2 ) <= ( config.NbTasks - nbChunks) {
 		// if postSplit we still have fewer tasks than available CPUs
-		// or if we have more tasks BUT the difference of CPU usage is in our favor, we split.
+		// or if we have more tasks BUT the difference of CPU usage is in our favor, we split.
 		config.NbTasks /= 2
 		var _p {{ $.TJacobian }}
 		chDone := make(chan struct{}, 1)
@@ -503,8 +503,8 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T
 				s1.add(&s2)
 				chChunks[chunkID] <- s1
 			}(j)
-			continue
-		}
+			continue
+		}
 		go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n])
 	}
 
@@ -527,7 +527,7 @@ func getChunkProcessor{{ $.UPointName }}(c uint64, stat chunkStat) func(chunkID
 	{{- else}}
 		const batchSize = {{batchSize $c}}
 		// here we could check some chunk statistic (deviation, ...) to determine if calling
-		// the batch affine version is worth it.
+		// the batch affine version is worth it.
 		if stat.nbBucketFilled < batchSize {
 			// clear indicator that the batch affine method is not appropriate here.
 			return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}]
@@ -549,14 +549,12 @@ func msmReduceChunk{{ $.TAffine }}(p *{{ $.TJacobian }}, c int, chChunks []chan
 	_p.Set(&totalj)
 	for j := len(chChunks) - 2; j >= 0; j-- {
 		for l := 0; l < c; l++ {
-			// TODO @gbotrel / @yelhousni --> what if _p is infinity here?
 			_p.double(&_p)
 		}
 		totalj := <-chChunks[j]
 		_p.add(&totalj)
 	}
 
-	// TODO @gbotrel / @yelhousni --> what if _p is infinity here?
 	return p.unsafeFromJacExtended(&_p)
 }
 
diff --git a/internal/generator/ecc/template/point.go.tmpl b/internal/generator/ecc/template/point.go.tmpl
index 57e40eb2c4..24046a3b19 100644
--- a/internal/generator/ecc/template/point.go.tmpl
+++ b/internal/generator/ecc/template/point.go.tmpl
@@ -1162,6 +1162,8 @@ func (p *{{ $TJacobianExtended }}) add(q *{{ $TJacobianExtended }}) *{{ $TJacobi
 
 // double point in Jacobian extended coordinates
 // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1
+// since we consider any point on Z=0 as the point at infinity
+// this doubling formula works for infinity points as well
 func (p *{{ $TJacobianExtended }}) double(q *{{ $TJacobianExtended }}) *{{ $TJacobianExtended }} {
 	var U, V, W, S, XX, M {{.CoordType}}
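To see why the comment stamped onto every double() holds: in dbl-2008-s-1 the output coordinates include ZZ3 = V*ZZ1 and ZZZ3 = W*ZZZ1, so an input with ZZ1 = ZZZ1 = 0 necessarily produces another Z=0 representation, and infinity stays infinity. A toy sketch over a small prime field (doubleXYZZ, xyzz, and the modulus are illustrative stand-ins, not the generated fp code):

package main

import "fmt"

// toy prime, standing in for the real fp modulus; small enough that all
// intermediate products below fit in an int64.
const p = 2147483647

type xyzz struct{ X, Y, ZZ, ZZZ int64 }

// doubleXYZZ follows dbl-2008-s-1 (curve coefficient a = 0). Note that the
// two Z outputs are products with the Z inputs, so Z=0 is preserved.
func doubleXYZZ(q xyzz) xyzz {
	mod := func(x int64) int64 { return ((x % p) + p) % p }
	U := mod(2 * q.Y)
	V := mod(U * U)
	W := mod(U * V)
	S := mod(q.X * V)
	XX := mod(q.X * q.X)
	M := mod(3 * XX)
	var r xyzz
	r.X = mod(mod(M*M) - 2*S)
	r.Y = mod(mod(M*(S-r.X)) - mod(W*q.Y))
	r.ZZ = mod(V * q.ZZ)   // zero whenever ZZ1 was zero
	r.ZZZ = mod(W * q.ZZZ) // zero whenever ZZZ1 was zero
	return r
}

func main() {
	inf := xyzz{X: 1, Y: 1}      // one representation of infinity: ZZ = ZZZ = 0
	fmt.Println(doubleXYZZ(inf)) // ZZ and ZZZ stay 0, so the result is still infinity
}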